1# Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2# 3# Licensed under the Apache License, Version 2.0 (the "License"); 4# you may not use this file except in compliance with the License. 5# You may obtain a copy of the License at 6# 7# http://www.apache.org/licenses/LICENSE-2.0 8# 9# Unless required by applicable law or agreed to in writing, software 10# distributed under the License is distributed on an "AS IS" BASIS, 11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12# See the License for the specific language governing permissions and 13# limitations under the License. 14# ============================================================================== 15"""Model definitions for simple speech recognition. 16 17""" 18import math 19 20import tensorflow as tf 21 22 23def _next_power_of_two(x): 24 """Calculates the smallest enclosing power of two for an input. 25 26 Args: 27 x: Positive float or integer number. 28 29 Returns: 30 Next largest power of two integer. 31 """ 32 return 1 if x == 0 else 2**(int(x) - 1).bit_length() 33 34 35def prepare_model_settings(label_count, sample_rate, clip_duration_ms, 36 window_size_ms, window_stride_ms, feature_bin_count, 37 preprocess): 38 """Calculates common settings needed for all models. 39 40 Args: 41 label_count: How many classes are to be recognized. 42 sample_rate: Number of audio samples per second. 43 clip_duration_ms: Length of each audio clip to be analyzed. 44 window_size_ms: Duration of frequency analysis window. 45 window_stride_ms: How far to move in time between frequency windows. 46 feature_bin_count: Number of frequency bins to use for analysis. 47 preprocess: How the spectrogram is processed to produce features. 48 49 Returns: 50 Dictionary containing common settings. 51 52 Raises: 53 ValueError: If the preprocessing mode isn't recognized. 54 """ 55 desired_samples = int(sample_rate * clip_duration_ms / 1000) 56 window_size_samples = int(sample_rate * window_size_ms / 1000) 57 window_stride_samples = int(sample_rate * window_stride_ms / 1000) 58 length_minus_window = (desired_samples - window_size_samples) 59 if length_minus_window < 0: 60 spectrogram_length = 0 61 else: 62 spectrogram_length = 1 + int(length_minus_window / window_stride_samples) 63 if preprocess == 'average': 64 fft_bin_count = 1 + (_next_power_of_two(window_size_samples) / 2) 65 average_window_width = int(math.floor(fft_bin_count / feature_bin_count)) 66 fingerprint_width = int(math.ceil(fft_bin_count / average_window_width)) 67 elif preprocess == 'mfcc': 68 average_window_width = -1 69 fingerprint_width = feature_bin_count 70 elif preprocess == 'micro': 71 average_window_width = -1 72 fingerprint_width = feature_bin_count 73 else: 74 raise ValueError('Unknown preprocess mode "%s" (should be "mfcc",' 75 ' "average", or "micro")' % (preprocess)) 76 fingerprint_size = fingerprint_width * spectrogram_length 77 return { 78 'desired_samples': desired_samples, 79 'window_size_samples': window_size_samples, 80 'window_stride_samples': window_stride_samples, 81 'spectrogram_length': spectrogram_length, 82 'fingerprint_width': fingerprint_width, 83 'fingerprint_size': fingerprint_size, 84 'label_count': label_count, 85 'sample_rate': sample_rate, 86 'preprocess': preprocess, 87 'average_window_width': average_window_width, 88 } 89 90 91def create_model(fingerprint_input, model_settings, model_architecture, 92 is_training, runtime_settings=None): 93 """Builds a model of the requested architecture compatible with the settings. 94 95 There are many possible ways of deriving predictions from a spectrogram 96 input, so this function provides an abstract interface for creating different 97 kinds of models in a black-box way. You need to pass in a TensorFlow node as 98 the 'fingerprint' input, and this should output a batch of 1D features that 99 describe the audio. Typically this will be derived from a spectrogram that's 100 been run through an MFCC, but in theory it can be any feature vector of the 101 size specified in model_settings['fingerprint_size']. 102 103 The function will build the graph it needs in the current TensorFlow graph, 104 and return the tensorflow output that will contain the 'logits' input to the 105 softmax prediction process. If training flag is on, it will also return a 106 placeholder node that can be used to control the dropout amount. 107 108 See the implementations below for the possible model architectures that can be 109 requested. 110 111 Args: 112 fingerprint_input: TensorFlow node that will output audio feature vectors. 113 model_settings: Dictionary of information about the model. 114 model_architecture: String specifying which kind of model to create. 115 is_training: Whether the model is going to be used for training. 116 runtime_settings: Dictionary of information about the runtime. 117 118 Returns: 119 TensorFlow node outputting logits results, and optionally a dropout 120 placeholder. 121 122 Raises: 123 Exception: If the architecture type isn't recognized. 124 """ 125 if model_architecture == 'single_fc': 126 return create_single_fc_model(fingerprint_input, model_settings, 127 is_training) 128 elif model_architecture == 'conv': 129 return create_conv_model(fingerprint_input, model_settings, is_training) 130 elif model_architecture == 'low_latency_conv': 131 return create_low_latency_conv_model(fingerprint_input, model_settings, 132 is_training) 133 elif model_architecture == 'low_latency_svdf': 134 return create_low_latency_svdf_model(fingerprint_input, model_settings, 135 is_training, runtime_settings) 136 elif model_architecture == 'tiny_conv': 137 return create_tiny_conv_model(fingerprint_input, model_settings, 138 is_training) 139 elif model_architecture == 'tiny_embedding_conv': 140 return create_tiny_embedding_conv_model(fingerprint_input, model_settings, 141 is_training) 142 else: 143 raise Exception('model_architecture argument "' + model_architecture + 144 '" not recognized, should be one of "single_fc", "conv",' + 145 ' "low_latency_conv, "low_latency_svdf",' + 146 ' "tiny_conv", or "tiny_embedding_conv"') 147 148 149def load_variables_from_checkpoint(sess, start_checkpoint): 150 """Utility function to centralize checkpoint restoration. 151 152 Args: 153 sess: TensorFlow session. 154 start_checkpoint: Path to saved checkpoint on disk. 155 """ 156 saver = tf.compat.v1.train.Saver(tf.compat.v1.global_variables()) 157 saver.restore(sess, start_checkpoint) 158 159 160def create_single_fc_model(fingerprint_input, model_settings, is_training): 161 """Builds a model with a single hidden fully-connected layer. 162 163 This is a very simple model with just one matmul and bias layer. As you'd 164 expect, it doesn't produce very accurate results, but it is very fast and 165 simple, so it's useful for sanity testing. 166 167 Here's the layout of the graph: 168 169 (fingerprint_input) 170 v 171 [MatMul]<-(weights) 172 v 173 [BiasAdd]<-(bias) 174 v 175 176 Args: 177 fingerprint_input: TensorFlow node that will output audio feature vectors. 178 model_settings: Dictionary of information about the model. 179 is_training: Whether the model is going to be used for training. 180 181 Returns: 182 TensorFlow node outputting logits results, and optionally a dropout 183 placeholder. 184 """ 185 if is_training: 186 dropout_rate = tf.compat.v1.placeholder(tf.float32, name='dropout_rate') 187 fingerprint_size = model_settings['fingerprint_size'] 188 label_count = model_settings['label_count'] 189 weights = tf.compat.v1.get_variable( 190 name='weights', 191 initializer=tf.compat.v1.truncated_normal_initializer(stddev=0.001), 192 shape=[fingerprint_size, label_count]) 193 bias = tf.compat.v1.get_variable(name='bias', 194 initializer=tf.compat.v1.zeros_initializer, 195 shape=[label_count]) 196 logits = tf.matmul(fingerprint_input, weights) + bias 197 if is_training: 198 return logits, dropout_rate 199 else: 200 return logits 201 202 203def create_conv_model(fingerprint_input, model_settings, is_training): 204 """Builds a standard convolutional model. 205 206 This is roughly the network labeled as 'cnn-trad-fpool3' in the 207 'Convolutional Neural Networks for Small-footprint Keyword Spotting' paper: 208 http://www.isca-speech.org/archive/interspeech_2015/papers/i15_1478.pdf 209 210 Here's the layout of the graph: 211 212 (fingerprint_input) 213 v 214 [Conv2D]<-(weights) 215 v 216 [BiasAdd]<-(bias) 217 v 218 [Relu] 219 v 220 [MaxPool] 221 v 222 [Conv2D]<-(weights) 223 v 224 [BiasAdd]<-(bias) 225 v 226 [Relu] 227 v 228 [MaxPool] 229 v 230 [MatMul]<-(weights) 231 v 232 [BiasAdd]<-(bias) 233 v 234 235 This produces fairly good quality results, but can involve a large number of 236 weight parameters and computations. For a cheaper alternative from the same 237 paper with slightly less accuracy, see 'low_latency_conv' below. 238 239 During training, dropout nodes are introduced after each relu, controlled by a 240 placeholder. 241 242 Args: 243 fingerprint_input: TensorFlow node that will output audio feature vectors. 244 model_settings: Dictionary of information about the model. 245 is_training: Whether the model is going to be used for training. 246 247 Returns: 248 TensorFlow node outputting logits results, and optionally a dropout 249 placeholder. 250 """ 251 if is_training: 252 dropout_rate = tf.compat.v1.placeholder(tf.float32, name='dropout_rate') 253 input_frequency_size = model_settings['fingerprint_width'] 254 input_time_size = model_settings['spectrogram_length'] 255 fingerprint_4d = tf.reshape(fingerprint_input, 256 [-1, input_time_size, input_frequency_size, 1]) 257 first_filter_width = 8 258 first_filter_height = 20 259 first_filter_count = 64 260 first_weights = tf.compat.v1.get_variable( 261 name='first_weights', 262 initializer=tf.compat.v1.truncated_normal_initializer(stddev=0.01), 263 shape=[first_filter_height, first_filter_width, 1, first_filter_count]) 264 first_bias = tf.compat.v1.get_variable( 265 name='first_bias', 266 initializer=tf.compat.v1.zeros_initializer, 267 shape=[first_filter_count]) 268 269 first_conv = tf.nn.conv2d(input=fingerprint_4d, 270 filters=first_weights, 271 strides=[1, 1, 1, 1], 272 padding='SAME') + first_bias 273 first_relu = tf.nn.relu(first_conv) 274 if is_training: 275 first_dropout = tf.nn.dropout(first_relu, rate=dropout_rate) 276 else: 277 first_dropout = first_relu 278 max_pool = tf.nn.max_pool2d(input=first_dropout, 279 ksize=[1, 2, 2, 1], 280 strides=[1, 2, 2, 1], 281 padding='SAME') 282 second_filter_width = 4 283 second_filter_height = 10 284 second_filter_count = 64 285 second_weights = tf.compat.v1.get_variable( 286 name='second_weights', 287 initializer=tf.compat.v1.truncated_normal_initializer(stddev=0.01), 288 shape=[ 289 second_filter_height, second_filter_width, first_filter_count, 290 second_filter_count 291 ]) 292 second_bias = tf.compat.v1.get_variable( 293 name='second_bias', 294 initializer=tf.compat.v1.zeros_initializer, 295 shape=[second_filter_count]) 296 second_conv = tf.nn.conv2d(input=max_pool, 297 filters=second_weights, 298 strides=[1, 1, 1, 1], 299 padding='SAME') + second_bias 300 second_relu = tf.nn.relu(second_conv) 301 if is_training: 302 second_dropout = tf.nn.dropout(second_relu, rate=dropout_rate) 303 else: 304 second_dropout = second_relu 305 second_conv_shape = second_dropout.get_shape() 306 second_conv_output_width = second_conv_shape[2] 307 second_conv_output_height = second_conv_shape[1] 308 second_conv_element_count = int( 309 second_conv_output_width * second_conv_output_height * 310 second_filter_count) 311 flattened_second_conv = tf.reshape(second_dropout, 312 [-1, second_conv_element_count]) 313 label_count = model_settings['label_count'] 314 final_fc_weights = tf.compat.v1.get_variable( 315 name='final_fc_weights', 316 initializer=tf.compat.v1.truncated_normal_initializer(stddev=0.01), 317 shape=[second_conv_element_count, label_count]) 318 final_fc_bias = tf.compat.v1.get_variable( 319 name='final_fc_bias', 320 initializer=tf.compat.v1.zeros_initializer, 321 shape=[label_count]) 322 final_fc = tf.matmul(flattened_second_conv, final_fc_weights) + final_fc_bias 323 if is_training: 324 return final_fc, dropout_rate 325 else: 326 return final_fc 327 328 329def create_low_latency_conv_model(fingerprint_input, model_settings, 330 is_training): 331 """Builds a convolutional model with low compute requirements. 332 333 This is roughly the network labeled as 'cnn-one-fstride4' in the 334 'Convolutional Neural Networks for Small-footprint Keyword Spotting' paper: 335 http://www.isca-speech.org/archive/interspeech_2015/papers/i15_1478.pdf 336 337 Here's the layout of the graph: 338 339 (fingerprint_input) 340 v 341 [Conv2D]<-(weights) 342 v 343 [BiasAdd]<-(bias) 344 v 345 [Relu] 346 v 347 [MatMul]<-(weights) 348 v 349 [BiasAdd]<-(bias) 350 v 351 [MatMul]<-(weights) 352 v 353 [BiasAdd]<-(bias) 354 v 355 [MatMul]<-(weights) 356 v 357 [BiasAdd]<-(bias) 358 v 359 360 This produces slightly lower quality results than the 'conv' model, but needs 361 fewer weight parameters and computations. 362 363 During training, dropout nodes are introduced after the relu, controlled by a 364 placeholder. 365 366 Args: 367 fingerprint_input: TensorFlow node that will output audio feature vectors. 368 model_settings: Dictionary of information about the model. 369 is_training: Whether the model is going to be used for training. 370 371 Returns: 372 TensorFlow node outputting logits results, and optionally a dropout 373 placeholder. 374 """ 375 if is_training: 376 dropout_rate = tf.compat.v1.placeholder(tf.float32, name='dropout_rate') 377 input_frequency_size = model_settings['fingerprint_width'] 378 input_time_size = model_settings['spectrogram_length'] 379 fingerprint_4d = tf.reshape(fingerprint_input, 380 [-1, input_time_size, input_frequency_size, 1]) 381 first_filter_width = 8 382 first_filter_height = input_time_size 383 first_filter_count = 186 384 first_filter_stride_x = 1 385 first_filter_stride_y = 1 386 first_weights = tf.compat.v1.get_variable( 387 name='first_weights', 388 initializer=tf.compat.v1.truncated_normal_initializer(stddev=0.01), 389 shape=[first_filter_height, first_filter_width, 1, first_filter_count]) 390 first_bias = tf.compat.v1.get_variable( 391 name='first_bias', 392 initializer=tf.compat.v1.zeros_initializer, 393 shape=[first_filter_count]) 394 first_conv = tf.nn.conv2d( 395 input=fingerprint_4d, 396 filters=first_weights, 397 strides=[1, first_filter_stride_y, first_filter_stride_x, 1], 398 padding='VALID') + first_bias 399 first_relu = tf.nn.relu(first_conv) 400 if is_training: 401 first_dropout = tf.nn.dropout(first_relu, rate=dropout_rate) 402 else: 403 first_dropout = first_relu 404 first_conv_output_width = math.floor( 405 (input_frequency_size - first_filter_width + first_filter_stride_x) / 406 first_filter_stride_x) 407 first_conv_output_height = math.floor( 408 (input_time_size - first_filter_height + first_filter_stride_y) / 409 first_filter_stride_y) 410 first_conv_element_count = int( 411 first_conv_output_width * first_conv_output_height * first_filter_count) 412 flattened_first_conv = tf.reshape(first_dropout, 413 [-1, first_conv_element_count]) 414 first_fc_output_channels = 128 415 first_fc_weights = tf.compat.v1.get_variable( 416 name='first_fc_weights', 417 initializer=tf.compat.v1.truncated_normal_initializer(stddev=0.01), 418 shape=[first_conv_element_count, first_fc_output_channels]) 419 first_fc_bias = tf.compat.v1.get_variable( 420 name='first_fc_bias', 421 initializer=tf.compat.v1.zeros_initializer, 422 shape=[first_fc_output_channels]) 423 first_fc = tf.matmul(flattened_first_conv, first_fc_weights) + first_fc_bias 424 if is_training: 425 second_fc_input = tf.nn.dropout(first_fc, rate=dropout_rate) 426 else: 427 second_fc_input = first_fc 428 second_fc_output_channels = 128 429 second_fc_weights = tf.compat.v1.get_variable( 430 name='second_fc_weights', 431 initializer=tf.compat.v1.truncated_normal_initializer(stddev=0.01), 432 shape=[first_fc_output_channels, second_fc_output_channels]) 433 second_fc_bias = tf.compat.v1.get_variable( 434 name='second_fc_bias', 435 initializer=tf.compat.v1.zeros_initializer, 436 shape=[second_fc_output_channels]) 437 second_fc = tf.matmul(second_fc_input, second_fc_weights) + second_fc_bias 438 if is_training: 439 final_fc_input = tf.nn.dropout(second_fc, rate=dropout_rate) 440 else: 441 final_fc_input = second_fc 442 label_count = model_settings['label_count'] 443 final_fc_weights = tf.compat.v1.get_variable( 444 name='final_fc_weights', 445 initializer=tf.compat.v1.truncated_normal_initializer(stddev=0.01), 446 shape=[second_fc_output_channels, label_count]) 447 final_fc_bias = tf.compat.v1.get_variable( 448 name='final_fc_bias', 449 initializer=tf.compat.v1.zeros_initializer, 450 shape=[label_count]) 451 final_fc = tf.matmul(final_fc_input, final_fc_weights) + final_fc_bias 452 if is_training: 453 return final_fc, dropout_rate 454 else: 455 return final_fc 456 457 458def create_low_latency_svdf_model(fingerprint_input, model_settings, 459 is_training, runtime_settings): 460 """Builds an SVDF model with low compute requirements. 461 462 This is based in the topology presented in the 'Compressing Deep Neural 463 Networks using a Rank-Constrained Topology' paper: 464 https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/43813.pdf 465 466 Here's the layout of the graph: 467 468 (fingerprint_input) 469 v 470 [SVDF]<-(weights) 471 v 472 [BiasAdd]<-(bias) 473 v 474 [Relu] 475 v 476 [MatMul]<-(weights) 477 v 478 [BiasAdd]<-(bias) 479 v 480 [MatMul]<-(weights) 481 v 482 [BiasAdd]<-(bias) 483 v 484 [MatMul]<-(weights) 485 v 486 [BiasAdd]<-(bias) 487 v 488 489 This model produces lower recognition accuracy than the 'conv' model above, 490 but requires fewer weight parameters and, significantly fewer computations. 491 492 During training, dropout nodes are introduced after the relu, controlled by a 493 placeholder. 494 495 Args: 496 fingerprint_input: TensorFlow node that will output audio feature vectors. 497 The node is expected to produce a 2D Tensor of shape: 498 [batch, model_settings['fingerprint_width'] * 499 model_settings['spectrogram_length']] 500 with the features corresponding to the same time slot arranged contiguously, 501 and the oldest slot at index [:, 0], and newest at [:, -1]. 502 model_settings: Dictionary of information about the model. 503 is_training: Whether the model is going to be used for training. 504 runtime_settings: Dictionary of information about the runtime. 505 506 Returns: 507 TensorFlow node outputting logits results, and optionally a dropout 508 placeholder. 509 510 Raises: 511 ValueError: If the inputs tensor is incorrectly shaped. 512 """ 513 if is_training: 514 dropout_rate = tf.compat.v1.placeholder(tf.float32, name='dropout_rate') 515 516 input_frequency_size = model_settings['fingerprint_width'] 517 input_time_size = model_settings['spectrogram_length'] 518 519 # Validation. 520 input_shape = fingerprint_input.get_shape() 521 if len(input_shape) != 2: 522 raise ValueError('Inputs to `SVDF` should have rank == 2.') 523 if input_shape[-1].value is None: 524 raise ValueError('The last dimension of the input to `SVDF` ' 525 'should be defined. Found `None`.') 526 if input_shape[-1].value % input_frequency_size != 0: 527 raise ValueError('The last dimension of the input to `SVDF` = {0} must be ' 528 'a multiple of the frame size = {1}'.format( 529 input_shape.shape[-1].value, input_frequency_size)) 530 531 # Set number of units (i.e. nodes) and rank. 532 rank = 2 533 num_units = 1280 534 # Number of filters: pairs of feature and time filters. 535 num_filters = rank * num_units 536 # Create the runtime memory: [num_filters, batch, input_time_size] 537 batch = 1 538 memory = tf.compat.v1.get_variable( 539 initializer=tf.compat.v1.zeros_initializer, 540 shape=[num_filters, batch, input_time_size], 541 trainable=False, 542 name='runtime-memory') 543 first_time_flag = tf.compat.v1.get_variable( 544 name='first_time_flag', dtype=tf.int32, initializer=1) 545 # Determine the number of new frames in the input, such that we only operate 546 # on those. For training we do not use the memory, and thus use all frames 547 # provided in the input. 548 # new_fingerprint_input: [batch, num_new_frames*input_frequency_size] 549 if is_training: 550 num_new_frames = input_time_size 551 else: 552 window_stride_ms = int(model_settings['window_stride_samples'] * 1000 / 553 model_settings['sample_rate']) 554 num_new_frames = tf.cond( 555 pred=tf.equal(first_time_flag, 1), 556 true_fn=lambda: input_time_size, 557 false_fn=lambda: int(runtime_settings['clip_stride_ms'] / window_stride_ms)) # pylint:disable=line-too-long 558 first_time_flag = 0 559 new_fingerprint_input = fingerprint_input[ 560 :, -num_new_frames*input_frequency_size:] 561 # Expand to add input channels dimension. 562 new_fingerprint_input = tf.expand_dims(new_fingerprint_input, 2) 563 564 # Create the frequency filters. 565 weights_frequency = tf.compat.v1.get_variable( 566 name='weights_frequency', 567 initializer=tf.compat.v1.truncated_normal_initializer(stddev=0.01), 568 shape=[input_frequency_size, num_filters]) 569 # Expand to add input channels dimensions. 570 # weights_frequency: [input_frequency_size, 1, num_filters] 571 weights_frequency = tf.expand_dims(weights_frequency, 1) 572 # Convolve the 1D feature filters sliding over the time dimension. 573 # activations_time: [batch, num_new_frames, num_filters] 574 activations_time = tf.nn.conv1d(input=new_fingerprint_input, 575 filters=weights_frequency, 576 stride=input_frequency_size, 577 padding='VALID') 578 # Rearrange such that we can perform the batched matmul. 579 # activations_time: [num_filters, batch, num_new_frames] 580 activations_time = tf.transpose(a=activations_time, perm=[2, 0, 1]) 581 582 # Runtime memory optimization. 583 if not is_training: 584 # We need to drop the activations corresponding to the oldest frames, and 585 # then add those corresponding to the new frames. 586 new_memory = memory[:, :, num_new_frames:] 587 new_memory = tf.concat([new_memory, activations_time], 2) 588 tf.compat.v1.assign(memory, new_memory) 589 activations_time = new_memory 590 591 # Create the time filters. 592 weights_time = tf.compat.v1.get_variable( 593 name='weights_time', 594 initializer=tf.compat.v1.truncated_normal_initializer(stddev=0.01), 595 shape=[num_filters, input_time_size]) 596 # Apply the time filter on the outputs of the feature filters. 597 # weights_time: [num_filters, input_time_size, 1] 598 # outputs: [num_filters, batch, 1] 599 weights_time = tf.expand_dims(weights_time, 2) 600 outputs = tf.matmul(activations_time, weights_time) 601 # Split num_units and rank into separate dimensions (the remaining 602 # dimension is the input_shape[0] -i.e. batch size). This also squeezes 603 # the last dimension, since it's not used. 604 # [num_filters, batch, 1] => [num_units, rank, batch] 605 outputs = tf.reshape(outputs, [num_units, rank, -1]) 606 # Sum the rank outputs per unit => [num_units, batch]. 607 units_output = tf.reduce_sum(input_tensor=outputs, axis=1) 608 # Transpose to shape [batch, num_units] 609 units_output = tf.transpose(a=units_output) 610 611 # Apply bias. 612 bias = tf.compat.v1.get_variable(name='bias', 613 initializer=tf.compat.v1.zeros_initializer, 614 shape=[num_units]) 615 first_bias = tf.nn.bias_add(units_output, bias) 616 617 # Relu. 618 first_relu = tf.nn.relu(first_bias) 619 620 if is_training: 621 first_dropout = tf.nn.dropout(first_relu, rate=dropout_rate) 622 else: 623 first_dropout = first_relu 624 625 first_fc_output_channels = 256 626 first_fc_weights = tf.compat.v1.get_variable( 627 name='first_fc_weights', 628 initializer=tf.compat.v1.truncated_normal_initializer(stddev=0.01), 629 shape=[num_units, first_fc_output_channels]) 630 first_fc_bias = tf.compat.v1.get_variable( 631 name='first_fc_bias', 632 initializer=tf.compat.v1.zeros_initializer, 633 shape=[first_fc_output_channels]) 634 first_fc = tf.matmul(first_dropout, first_fc_weights) + first_fc_bias 635 if is_training: 636 second_fc_input = tf.nn.dropout(first_fc, rate=dropout_rate) 637 else: 638 second_fc_input = first_fc 639 second_fc_output_channels = 256 640 second_fc_weights = tf.compat.v1.get_variable( 641 name='second_fc_weights', 642 initializer=tf.compat.v1.truncated_normal_initializer(stddev=0.01), 643 shape=[first_fc_output_channels, second_fc_output_channels]) 644 second_fc_bias = tf.compat.v1.get_variable( 645 name='second_fc_bias', 646 initializer=tf.compat.v1.zeros_initializer, 647 shape=[second_fc_output_channels]) 648 second_fc = tf.matmul(second_fc_input, second_fc_weights) + second_fc_bias 649 if is_training: 650 final_fc_input = tf.nn.dropout(second_fc, rate=dropout_rate) 651 else: 652 final_fc_input = second_fc 653 label_count = model_settings['label_count'] 654 final_fc_weights = tf.compat.v1.get_variable( 655 name='final_fc_weights', 656 initializer=tf.compat.v1.truncated_normal_initializer(stddev=0.01), 657 shape=[second_fc_output_channels, label_count]) 658 final_fc_bias = tf.compat.v1.get_variable( 659 name='final_fc_bias', 660 initializer=tf.compat.v1.zeros_initializer, 661 shape=[label_count]) 662 final_fc = tf.matmul(final_fc_input, final_fc_weights) + final_fc_bias 663 if is_training: 664 return final_fc, dropout_rate 665 else: 666 return final_fc 667 668 669def create_tiny_conv_model(fingerprint_input, model_settings, is_training): 670 """Builds a convolutional model aimed at microcontrollers. 671 672 Devices like DSPs and microcontrollers can have very small amounts of 673 memory and limited processing power. This model is designed to use less 674 than 20KB of working RAM, and fit within 32KB of read-only (flash) memory. 675 676 Here's the layout of the graph: 677 678 (fingerprint_input) 679 v 680 [Conv2D]<-(weights) 681 v 682 [BiasAdd]<-(bias) 683 v 684 [Relu] 685 v 686 [MatMul]<-(weights) 687 v 688 [BiasAdd]<-(bias) 689 v 690 691 This doesn't produce particularly accurate results, but it's designed to be 692 used as the first stage of a pipeline, running on a low-energy piece of 693 hardware that can always be on, and then wake higher-power chips when a 694 possible utterance has been found, so that more accurate analysis can be done. 695 696 During training, a dropout node is introduced after the relu, controlled by a 697 placeholder. 698 699 Args: 700 fingerprint_input: TensorFlow node that will output audio feature vectors. 701 model_settings: Dictionary of information about the model. 702 is_training: Whether the model is going to be used for training. 703 704 Returns: 705 TensorFlow node outputting logits results, and optionally a dropout 706 placeholder. 707 """ 708 if is_training: 709 dropout_rate = tf.compat.v1.placeholder(tf.float32, name='dropout_rate') 710 input_frequency_size = model_settings['fingerprint_width'] 711 input_time_size = model_settings['spectrogram_length'] 712 fingerprint_4d = tf.reshape(fingerprint_input, 713 [-1, input_time_size, input_frequency_size, 1]) 714 first_filter_width = 8 715 first_filter_height = 10 716 first_filter_count = 8 717 first_weights = tf.compat.v1.get_variable( 718 name='first_weights', 719 initializer=tf.compat.v1.truncated_normal_initializer(stddev=0.01), 720 shape=[first_filter_height, first_filter_width, 1, first_filter_count]) 721 first_bias = tf.compat.v1.get_variable( 722 name='first_bias', 723 initializer=tf.compat.v1.zeros_initializer, 724 shape=[first_filter_count]) 725 first_conv_stride_x = 2 726 first_conv_stride_y = 2 727 first_conv = tf.nn.conv2d( 728 input=fingerprint_4d, filters=first_weights, 729 strides=[1, first_conv_stride_y, first_conv_stride_x, 1], 730 padding='SAME') + first_bias 731 first_relu = tf.nn.relu(first_conv) 732 if is_training: 733 first_dropout = tf.nn.dropout(first_relu, rate=dropout_rate) 734 else: 735 first_dropout = first_relu 736 first_dropout_shape = first_dropout.get_shape() 737 first_dropout_output_width = first_dropout_shape[2] 738 first_dropout_output_height = first_dropout_shape[1] 739 first_dropout_element_count = int( 740 first_dropout_output_width * first_dropout_output_height * 741 first_filter_count) 742 flattened_first_dropout = tf.reshape(first_dropout, 743 [-1, first_dropout_element_count]) 744 label_count = model_settings['label_count'] 745 final_fc_weights = tf.compat.v1.get_variable( 746 name='final_fc_weights', 747 initializer=tf.compat.v1.truncated_normal_initializer(stddev=0.01), 748 shape=[first_dropout_element_count, label_count]) 749 final_fc_bias = tf.compat.v1.get_variable( 750 name='final_fc_bias', 751 initializer=tf.compat.v1.zeros_initializer, 752 shape=[label_count]) 753 final_fc = ( 754 tf.matmul(flattened_first_dropout, final_fc_weights) + final_fc_bias) 755 if is_training: 756 return final_fc, dropout_rate 757 else: 758 return final_fc 759 760 761def create_tiny_embedding_conv_model(fingerprint_input, model_settings, 762 is_training): 763 """Builds a convolutional model aimed at microcontrollers. 764 765 Devices like DSPs and microcontrollers can have very small amounts of 766 memory and limited processing power. This model is designed to use less 767 than 20KB of working RAM, and fit within 32KB of read-only (flash) memory. 768 769 Here's the layout of the graph: 770 771 (fingerprint_input) 772 v 773 [Conv2D]<-(weights) 774 v 775 [BiasAdd]<-(bias) 776 v 777 [Relu] 778 v 779 [Conv2D]<-(weights) 780 v 781 [BiasAdd]<-(bias) 782 v 783 [Relu] 784 v 785 [Conv2D]<-(weights) 786 v 787 [BiasAdd]<-(bias) 788 v 789 [Relu] 790 v 791 [MatMul]<-(weights) 792 v 793 [BiasAdd]<-(bias) 794 v 795 796 This doesn't produce particularly accurate results, but it's designed to be 797 used as the first stage of a pipeline, running on a low-energy piece of 798 hardware that can always be on, and then wake higher-power chips when a 799 possible utterance has been found, so that more accurate analysis can be done. 800 801 During training, a dropout node is introduced after the relu, controlled by a 802 placeholder. 803 804 Args: 805 fingerprint_input: TensorFlow node that will output audio feature vectors. 806 model_settings: Dictionary of information about the model. 807 is_training: Whether the model is going to be used for training. 808 809 Returns: 810 TensorFlow node outputting logits results, and optionally a dropout 811 placeholder. 812 """ 813 if is_training: 814 dropout_rate = tf.compat.v1.placeholder(tf.float32, name='dropout_rate') 815 input_frequency_size = model_settings['fingerprint_width'] 816 input_time_size = model_settings['spectrogram_length'] 817 fingerprint_4d = tf.reshape(fingerprint_input, 818 [-1, input_time_size, input_frequency_size, 1]) 819 820 first_filter_width = 8 821 first_filter_height = 10 822 first_filter_count = 8 823 first_weights = tf.compat.v1.get_variable( 824 name='first_weights', 825 initializer=tf.compat.v1.truncated_normal_initializer(stddev=0.01), 826 shape=[first_filter_height, first_filter_width, 1, first_filter_count]) 827 first_bias = tf.compat.v1.get_variable( 828 name='first_bias', 829 initializer=tf.compat.v1.zeros_initializer, 830 shape=[first_filter_count]) 831 first_conv_stride_x = 2 832 first_conv_stride_y = 2 833 834 first_conv = tf.nn.conv2d( 835 input=fingerprint_4d, filters=first_weights, 836 strides=[1, first_conv_stride_y, first_conv_stride_x, 1], 837 padding='SAME') + first_bias 838 first_relu = tf.nn.relu(first_conv) 839 if is_training: 840 first_dropout = tf.nn.dropout(first_relu, rate=dropout_rate) 841 842 else: 843 first_dropout = first_relu 844 845 second_filter_width = 8 846 second_filter_height = 10 847 second_filter_count = 8 848 second_weights = tf.compat.v1.get_variable( 849 name='second_weights', 850 initializer=tf.compat.v1.truncated_normal_initializer(stddev=0.01), 851 shape=[ 852 second_filter_height, second_filter_width, first_filter_count, 853 second_filter_count 854 ]) 855 second_bias = tf.compat.v1.get_variable( 856 name='second_bias', 857 initializer=tf.compat.v1.zeros_initializer, 858 shape=[second_filter_count]) 859 second_conv_stride_x = 8 860 second_conv_stride_y = 8 861 second_conv = tf.nn.conv2d( 862 input=first_dropout, filters=second_weights, 863 strides=[1, second_conv_stride_y, second_conv_stride_x, 1], 864 padding='SAME') + second_bias 865 second_relu = tf.nn.relu(second_conv) 866 if is_training: 867 second_dropout = tf.nn.dropout(second_relu, rate=dropout_rate) 868 else: 869 second_dropout = second_relu 870 871 second_dropout_shape = second_dropout.get_shape() 872 second_dropout_output_width = second_dropout_shape[2] 873 second_dropout_output_height = second_dropout_shape[1] 874 second_dropout_element_count = int(second_dropout_output_width * 875 second_dropout_output_height * 876 second_filter_count) 877 flattened_second_dropout = tf.reshape(second_dropout, 878 [-1, second_dropout_element_count]) 879 label_count = model_settings['label_count'] 880 final_fc_weights = tf.compat.v1.get_variable( 881 name='final_fc_weights', 882 initializer=tf.compat.v1.truncated_normal_initializer(stddev=0.01), 883 shape=[second_dropout_element_count, label_count]) 884 final_fc_bias = tf.compat.v1.get_variable( 885 name='final_fc_bias', 886 initializer=tf.compat.v1.zeros_initializer, 887 shape=[label_count]) 888 final_fc = ( 889 tf.matmul(flattened_second_dropout, final_fc_weights) + final_fc_bias) 890 if is_training: 891 return final_fc, dropout_rate 892 else: 893 return final_fc 894