# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Model definitions for simple speech recognition."""
import math

import tensorflow as tf


def _next_power_of_two(x):
  """Calculates the smallest enclosing power of two for an input.

  Args:
    x: Positive float or integer number.

  Returns:
    Next largest power of two integer.
  """
  return 1 if x == 0 else 2**(int(x) - 1).bit_length()
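
# A quick check (editor's note, not part of the original module): a 30ms
# window at 16kHz is 480 samples, and its smallest enclosing power of two is
# 512, which is the FFT size the 'average' preprocessing mode works from.
#
#   assert _next_power_of_two(480) == 512
#   assert _next_power_of_two(512) == 512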


def prepare_model_settings(label_count, sample_rate, clip_duration_ms,
                           window_size_ms, window_stride_ms, feature_bin_count,
                           preprocess):
  """Calculates common settings needed for all models.

  Args:
    label_count: How many classes are to be recognized.
    sample_rate: Number of audio samples per second.
    clip_duration_ms: Length of each audio clip to be analyzed.
    window_size_ms: Duration of frequency analysis window.
    window_stride_ms: How far to move in time between frequency windows.
    feature_bin_count: Number of frequency bins to use for analysis.
    preprocess: How the spectrogram is processed to produce features.

  Returns:
    Dictionary containing common settings.

  Raises:
    ValueError: If the preprocessing mode isn't recognized.
  """
  desired_samples = int(sample_rate * clip_duration_ms / 1000)
  window_size_samples = int(sample_rate * window_size_ms / 1000)
  window_stride_samples = int(sample_rate * window_stride_ms / 1000)
  length_minus_window = (desired_samples - window_size_samples)
  if length_minus_window < 0:
    spectrogram_length = 0
  else:
    spectrogram_length = 1 + int(length_minus_window / window_stride_samples)
  if preprocess == 'average':
    fft_bin_count = 1 + (_next_power_of_two(window_size_samples) / 2)
    average_window_width = int(math.floor(fft_bin_count / feature_bin_count))
    fingerprint_width = int(math.ceil(fft_bin_count / average_window_width))
  elif preprocess == 'mfcc':
    average_window_width = -1
    fingerprint_width = feature_bin_count
  elif preprocess == 'micro':
    average_window_width = -1
    fingerprint_width = feature_bin_count
  else:
    raise ValueError('Unknown preprocess mode "%s" (should be "mfcc",'
                     ' "average", or "micro")' % (preprocess))
  fingerprint_size = fingerprint_width * spectrogram_length
  return {
      'desired_samples': desired_samples,
      'window_size_samples': window_size_samples,
      'window_stride_samples': window_stride_samples,
      'spectrogram_length': spectrogram_length,
      'fingerprint_width': fingerprint_width,
      'fingerprint_size': fingerprint_size,
      'label_count': label_count,
      'sample_rate': sample_rate,
      'preprocess': preprocess,
      'average_window_width': average_window_width,
  }
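
# Example (editor's sketch, not part of the original module): with the Speech
# Commands defaults of a one-second clip at 16kHz, a 30ms window, a 10ms
# stride, and 40 feature bins, this yields 16000 desired samples, a
# 480-sample window, a 160-sample stride, 1 + (16000 - 480) // 160 = 98
# spectrogram frames, and a fingerprint of 40 * 98 = 3920 values.
#
#   settings = prepare_model_settings(
#       label_count=12, sample_rate=16000, clip_duration_ms=1000,
#       window_size_ms=30, window_stride_ms=10, feature_bin_count=40,
#       preprocess='mfcc')
#   assert settings['spectrogram_length'] == 98
#   assert settings['fingerprint_size'] == 3920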


def create_model(fingerprint_input, model_settings, model_architecture,
                 is_training, runtime_settings=None):
  """Builds a model of the requested architecture compatible with the settings.

  There are many possible ways of deriving predictions from a spectrogram
  input, so this function provides an abstract interface for creating different
  kinds of models in a black-box way. You need to pass in a TensorFlow node as
  the 'fingerprint' input, and this should output a batch of 1D features that
  describe the audio. Typically this will be derived from a spectrogram that's
  been run through an MFCC, but in theory it can be any feature vector of the
  size specified in model_settings['fingerprint_size'].

  The function will build the graph it needs in the current TensorFlow graph,
  and return the TensorFlow output that will contain the 'logits' input to the
  softmax prediction process. If the training flag is on, it will also return a
  placeholder node that can be used to control the dropout amount.

  See the implementations below for the possible model architectures that can
  be requested.

  Args:
    fingerprint_input: TensorFlow node that will output audio feature vectors.
    model_settings: Dictionary of information about the model.
    model_architecture: String specifying which kind of model to create.
    is_training: Whether the model is going to be used for training.
    runtime_settings: Dictionary of information about the runtime.

  Returns:
    TensorFlow node outputting logits results, and optionally a dropout
    placeholder.

  Raises:
    Exception: If the architecture type isn't recognized.
  """
  if model_architecture == 'single_fc':
    return create_single_fc_model(fingerprint_input, model_settings,
                                  is_training)
  elif model_architecture == 'conv':
    return create_conv_model(fingerprint_input, model_settings, is_training)
  elif model_architecture == 'low_latency_conv':
    return create_low_latency_conv_model(fingerprint_input, model_settings,
                                         is_training)
  elif model_architecture == 'low_latency_svdf':
    return create_low_latency_svdf_model(fingerprint_input, model_settings,
                                         is_training, runtime_settings)
  elif model_architecture == 'tiny_conv':
    return create_tiny_conv_model(fingerprint_input, model_settings,
                                  is_training)
  elif model_architecture == 'tiny_embedding_conv':
    return create_tiny_embedding_conv_model(fingerprint_input, model_settings,
                                            is_training)
  else:
    raise Exception('model_architecture argument "' + model_architecture +
                    '" not recognized, should be one of "single_fc", "conv",'
                    ' "low_latency_conv", "low_latency_svdf",'
                    ' "tiny_conv", or "tiny_embedding_conv"')
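
# Example (editor's sketch, not part of the original module; 'settings' is
# the dictionary from the prepare_model_settings() example above): building
# the 'conv' architecture for training returns the logits node plus the
# dropout placeholder.
#
#   fingerprint_input = tf.compat.v1.placeholder(
#       tf.float32, [None, settings['fingerprint_size']], name='fingerprint')
#   logits, dropout_rate = create_model(
#       fingerprint_input, settings, 'conv', is_training=True)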


def load_variables_from_checkpoint(sess, start_checkpoint):
  """Utility function to centralize checkpoint restoration.

  Args:
    sess: TensorFlow session.
    start_checkpoint: Path to saved checkpoint on disk.
  """
  saver = tf.compat.v1.train.Saver(tf.compat.v1.global_variables())
  saver.restore(sess, start_checkpoint)
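
# Example (editor's sketch; the checkpoint path is hypothetical):
#
#   with tf.compat.v1.Session() as sess:
#     load_variables_from_checkpoint(
#         sess, '/tmp/speech_commands_train/conv.ckpt-100')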


def create_single_fc_model(fingerprint_input, model_settings, is_training):
  """Builds a model with a single hidden fully-connected layer.

  This is a very simple model with just one matmul and bias layer. As you'd
  expect, it doesn't produce very accurate results, but it is very fast and
  simple, so it's useful for sanity testing.

  Here's the layout of the graph:

  (fingerprint_input)
          v
      [MatMul]<-(weights)
          v
      [BiasAdd]<-(bias)
          v

  Args:
    fingerprint_input: TensorFlow node that will output audio feature vectors.
    model_settings: Dictionary of information about the model.
    is_training: Whether the model is going to be used for training.

  Returns:
    TensorFlow node outputting logits results, and optionally a dropout
    placeholder.
  """
  if is_training:
    dropout_rate = tf.compat.v1.placeholder(tf.float32, name='dropout_rate')
  fingerprint_size = model_settings['fingerprint_size']
  label_count = model_settings['label_count']
  weights = tf.compat.v1.get_variable(
      name='weights',
      initializer=tf.compat.v1.truncated_normal_initializer(stddev=0.001),
      shape=[fingerprint_size, label_count])
  bias = tf.compat.v1.get_variable(name='bias',
                                   initializer=tf.compat.v1.zeros_initializer,
                                   shape=[label_count])
  logits = tf.matmul(fingerprint_input, weights) + bias
  if is_training:
    return logits, dropout_rate
  else:
    return logits
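
# Sizing note (editor's sketch, not part of the original module): for the
# 3920-value fingerprint and 12 labels from the examples above, this model
# holds just 3920 * 12 + 12 = 47052 parameters in a single matmul.
#
#   logits, dropout_rate = create_single_fc_model(
#       fingerprint_input, settings, is_training=True)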


def create_conv_model(fingerprint_input, model_settings, is_training):
  """Builds a standard convolutional model.

  This is roughly the network labeled as 'cnn-trad-fpool3' in the
  'Convolutional Neural Networks for Small-footprint Keyword Spotting' paper:
  http://www.isca-speech.org/archive/interspeech_2015/papers/i15_1478.pdf

  Here's the layout of the graph (note that unlike the paper, no pooling is
  applied after the second convolution):

  (fingerprint_input)
          v
      [Conv2D]<-(weights)
          v
      [BiasAdd]<-(bias)
          v
        [Relu]
          v
      [MaxPool]
          v
      [Conv2D]<-(weights)
          v
      [BiasAdd]<-(bias)
          v
        [Relu]
          v
      [MatMul]<-(weights)
          v
      [BiasAdd]<-(bias)
          v

  This produces fairly good quality results, but can involve a large number of
  weight parameters and computations. For a cheaper alternative from the same
  paper with slightly less accuracy, see 'low_latency_conv' below.

  During training, dropout nodes are introduced after each relu, controlled by a
  placeholder.

  Args:
    fingerprint_input: TensorFlow node that will output audio feature vectors.
    model_settings: Dictionary of information about the model.
    is_training: Whether the model is going to be used for training.

  Returns:
    TensorFlow node outputting logits results, and optionally a dropout
    placeholder.
  """
  if is_training:
    dropout_rate = tf.compat.v1.placeholder(tf.float32, name='dropout_rate')
  input_frequency_size = model_settings['fingerprint_width']
  input_time_size = model_settings['spectrogram_length']
  fingerprint_4d = tf.reshape(fingerprint_input,
                              [-1, input_time_size, input_frequency_size, 1])
  first_filter_width = 8
  first_filter_height = 20
  first_filter_count = 64
  first_weights = tf.compat.v1.get_variable(
      name='first_weights',
      initializer=tf.compat.v1.truncated_normal_initializer(stddev=0.01),
      shape=[first_filter_height, first_filter_width, 1, first_filter_count])
  first_bias = tf.compat.v1.get_variable(
      name='first_bias',
      initializer=tf.compat.v1.zeros_initializer,
      shape=[first_filter_count])

  first_conv = tf.nn.conv2d(input=fingerprint_4d,
                            filters=first_weights,
                            strides=[1, 1, 1, 1],
                            padding='SAME') + first_bias
  first_relu = tf.nn.relu(first_conv)
  if is_training:
    first_dropout = tf.nn.dropout(first_relu, rate=dropout_rate)
  else:
    first_dropout = first_relu
  max_pool = tf.nn.max_pool2d(input=first_dropout,
                              ksize=[1, 2, 2, 1],
                              strides=[1, 2, 2, 1],
                              padding='SAME')
  second_filter_width = 4
  second_filter_height = 10
  second_filter_count = 64
  second_weights = tf.compat.v1.get_variable(
      name='second_weights',
      initializer=tf.compat.v1.truncated_normal_initializer(stddev=0.01),
      shape=[
          second_filter_height, second_filter_width, first_filter_count,
          second_filter_count
      ])
  second_bias = tf.compat.v1.get_variable(
      name='second_bias',
      initializer=tf.compat.v1.zeros_initializer,
      shape=[second_filter_count])
  second_conv = tf.nn.conv2d(input=max_pool,
                             filters=second_weights,
                             strides=[1, 1, 1, 1],
                             padding='SAME') + second_bias
  second_relu = tf.nn.relu(second_conv)
  if is_training:
    second_dropout = tf.nn.dropout(second_relu, rate=dropout_rate)
  else:
    second_dropout = second_relu
  second_conv_shape = second_dropout.get_shape()
  second_conv_output_width = second_conv_shape[2]
  second_conv_output_height = second_conv_shape[1]
  second_conv_element_count = int(
      second_conv_output_width * second_conv_output_height *
      second_filter_count)
  flattened_second_conv = tf.reshape(second_dropout,
                                     [-1, second_conv_element_count])
  label_count = model_settings['label_count']
  final_fc_weights = tf.compat.v1.get_variable(
      name='final_fc_weights',
      initializer=tf.compat.v1.truncated_normal_initializer(stddev=0.01),
      shape=[second_conv_element_count, label_count])
  final_fc_bias = tf.compat.v1.get_variable(
      name='final_fc_bias',
      initializer=tf.compat.v1.zeros_initializer,
      shape=[label_count])
  final_fc = tf.matmul(flattened_second_conv, final_fc_weights) + final_fc_bias
  if is_training:
    return final_fc, dropout_rate
  else:
    return final_fc
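
# Shape arithmetic (editor's note, not part of the original module): with the
# 98x40 fingerprint from the examples above, the first convolution keeps
# 98x40x64 ('SAME' padding, stride 1), the 2x2 max pool reduces that to
# 49x20x64, and the second convolution keeps 49x20x64, so the final matmul
# sees 49 * 20 * 64 = 62720 inputs and holds 62720 * 12 weights for 12
# labels.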


def create_low_latency_conv_model(fingerprint_input, model_settings,
                                  is_training):
  """Builds a convolutional model with low compute requirements.

  This is roughly the network labeled as 'cnn-one-fstride4' in the
  'Convolutional Neural Networks for Small-footprint Keyword Spotting' paper:
  http://www.isca-speech.org/archive/interspeech_2015/papers/i15_1478.pdf

  Here's the layout of the graph:

  (fingerprint_input)
          v
      [Conv2D]<-(weights)
          v
      [BiasAdd]<-(bias)
          v
        [Relu]
          v
      [MatMul]<-(weights)
          v
      [BiasAdd]<-(bias)
          v
      [MatMul]<-(weights)
          v
      [BiasAdd]<-(bias)
          v
      [MatMul]<-(weights)
          v
      [BiasAdd]<-(bias)
          v

  This produces slightly lower quality results than the 'conv' model, but needs
  fewer weight parameters and computations.

  During training, dropout nodes are introduced after the relu and after each
  hidden fully-connected layer, controlled by a placeholder.

  Args:
    fingerprint_input: TensorFlow node that will output audio feature vectors.
    model_settings: Dictionary of information about the model.
    is_training: Whether the model is going to be used for training.

  Returns:
    TensorFlow node outputting logits results, and optionally a dropout
    placeholder.
  """
  if is_training:
    dropout_rate = tf.compat.v1.placeholder(tf.float32, name='dropout_rate')
  input_frequency_size = model_settings['fingerprint_width']
  input_time_size = model_settings['spectrogram_length']
  fingerprint_4d = tf.reshape(fingerprint_input,
                              [-1, input_time_size, input_frequency_size, 1])
  first_filter_width = 8
  first_filter_height = input_time_size
  first_filter_count = 186
  first_filter_stride_x = 1
  first_filter_stride_y = 1
  first_weights = tf.compat.v1.get_variable(
      name='first_weights',
      initializer=tf.compat.v1.truncated_normal_initializer(stddev=0.01),
      shape=[first_filter_height, first_filter_width, 1, first_filter_count])
  first_bias = tf.compat.v1.get_variable(
      name='first_bias',
      initializer=tf.compat.v1.zeros_initializer,
      shape=[first_filter_count])
  first_conv = tf.nn.conv2d(
      input=fingerprint_4d,
      filters=first_weights,
      strides=[1, first_filter_stride_y, first_filter_stride_x, 1],
      padding='VALID') + first_bias
  first_relu = tf.nn.relu(first_conv)
  if is_training:
    first_dropout = tf.nn.dropout(first_relu, rate=dropout_rate)
  else:
    first_dropout = first_relu
  first_conv_output_width = math.floor(
      (input_frequency_size - first_filter_width + first_filter_stride_x) /
      first_filter_stride_x)
  first_conv_output_height = math.floor(
      (input_time_size - first_filter_height + first_filter_stride_y) /
      first_filter_stride_y)
  first_conv_element_count = int(
      first_conv_output_width * first_conv_output_height * first_filter_count)
  flattened_first_conv = tf.reshape(first_dropout,
                                    [-1, first_conv_element_count])
  first_fc_output_channels = 128
  first_fc_weights = tf.compat.v1.get_variable(
      name='first_fc_weights',
      initializer=tf.compat.v1.truncated_normal_initializer(stddev=0.01),
      shape=[first_conv_element_count, first_fc_output_channels])
  first_fc_bias = tf.compat.v1.get_variable(
      name='first_fc_bias',
      initializer=tf.compat.v1.zeros_initializer,
      shape=[first_fc_output_channels])
  first_fc = tf.matmul(flattened_first_conv, first_fc_weights) + first_fc_bias
  if is_training:
    second_fc_input = tf.nn.dropout(first_fc, rate=dropout_rate)
  else:
    second_fc_input = first_fc
  second_fc_output_channels = 128
  second_fc_weights = tf.compat.v1.get_variable(
      name='second_fc_weights',
      initializer=tf.compat.v1.truncated_normal_initializer(stddev=0.01),
      shape=[first_fc_output_channels, second_fc_output_channels])
  second_fc_bias = tf.compat.v1.get_variable(
      name='second_fc_bias',
      initializer=tf.compat.v1.zeros_initializer,
      shape=[second_fc_output_channels])
  second_fc = tf.matmul(second_fc_input, second_fc_weights) + second_fc_bias
  if is_training:
    final_fc_input = tf.nn.dropout(second_fc, rate=dropout_rate)
  else:
    final_fc_input = second_fc
  label_count = model_settings['label_count']
  final_fc_weights = tf.compat.v1.get_variable(
      name='final_fc_weights',
      initializer=tf.compat.v1.truncated_normal_initializer(stddev=0.01),
      shape=[second_fc_output_channels, label_count])
  final_fc_bias = tf.compat.v1.get_variable(
      name='final_fc_bias',
      initializer=tf.compat.v1.zeros_initializer,
      shape=[label_count])
  final_fc = tf.matmul(final_fc_input, final_fc_weights) + final_fc_bias
  if is_training:
    return final_fc, dropout_rate
  else:
    return final_fc
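
# Shape arithmetic (editor's note, not part of the original module): the
# first filter spans the full input in time, so with a 98x40 fingerprint the
# 'VALID' convolution output is floor((98 - 98 + 1) / 1) = 1 frame high and
# floor((40 - 8 + 1) / 1) = 33 wide, leaving 33 * 1 * 186 = 6138 values to
# feed the first fully-connected layer.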


def create_low_latency_svdf_model(fingerprint_input, model_settings,
                                  is_training, runtime_settings):
  """Builds an SVDF model with low compute requirements.

  This is based on the topology presented in the 'Compressing Deep Neural
  Networks using a Rank-Constrained Topology' paper:
  https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/43813.pdf

  Here's the layout of the graph:

  (fingerprint_input)
          v
        [SVDF]<-(weights)
          v
      [BiasAdd]<-(bias)
          v
        [Relu]
          v
      [MatMul]<-(weights)
          v
      [BiasAdd]<-(bias)
          v
      [MatMul]<-(weights)
          v
      [BiasAdd]<-(bias)
          v
      [MatMul]<-(weights)
          v
      [BiasAdd]<-(bias)
          v

  This model produces lower recognition accuracy than the 'conv' model above,
  but requires fewer weight parameters and significantly fewer computations.

  During training, dropout nodes are introduced after the relu and after each
  hidden fully-connected layer, controlled by a placeholder.

  Args:
    fingerprint_input: TensorFlow node that will output audio feature vectors.
      The node is expected to produce a 2D Tensor of shape:
        [batch, model_settings['fingerprint_width'] *
                model_settings['spectrogram_length']]
      with the features corresponding to the same time slot arranged
      contiguously, the oldest slot at index [:, 0], and the newest at
      [:, -1].
    model_settings: Dictionary of information about the model.
    is_training: Whether the model is going to be used for training.
    runtime_settings: Dictionary of information about the runtime.

  Returns:
    TensorFlow node outputting logits results, and optionally a dropout
    placeholder.

  Raises:
    ValueError: If the inputs tensor is incorrectly shaped.
  """
  if is_training:
    dropout_rate = tf.compat.v1.placeholder(tf.float32, name='dropout_rate')

  input_frequency_size = model_settings['fingerprint_width']
  input_time_size = model_settings['spectrogram_length']

  # Validation.
  input_shape = fingerprint_input.get_shape()
  if len(input_shape) != 2:
    raise ValueError('Inputs to `SVDF` should have rank == 2.')
  if input_shape[-1].value is None:
    raise ValueError('The last dimension of the input to `SVDF` '
                     'should be defined. Found `None`.')
  if input_shape[-1].value % input_frequency_size != 0:
    raise ValueError('The last dimension of the input to `SVDF` = {0} must be '
                     'a multiple of the frame size = {1}'.format(
                         input_shape[-1].value, input_frequency_size))

  # Set number of units (i.e. nodes) and rank.
  rank = 2
  num_units = 1280
  # Number of filters: pairs of feature and time filters.
  num_filters = rank * num_units
  # Create the runtime memory: [num_filters, batch, input_time_size]
  batch = 1
  memory = tf.compat.v1.get_variable(
      initializer=tf.compat.v1.zeros_initializer,
      shape=[num_filters, batch, input_time_size],
      trainable=False,
      name='runtime-memory')
  first_time_flag = tf.compat.v1.get_variable(
      name='first_time_flag', dtype=tf.int32, initializer=1)
  # Determine the number of new frames in the input, such that we only operate
  # on those. For training we do not use the memory, and thus use all frames
  # provided in the input.
  # new_fingerprint_input: [batch, num_new_frames*input_frequency_size]
  if is_training:
    num_new_frames = input_time_size
  else:
    window_stride_ms = int(model_settings['window_stride_samples'] * 1000 /
                           model_settings['sample_rate'])
    num_new_frames = tf.cond(
        pred=tf.equal(first_time_flag, 1),
        true_fn=lambda: input_time_size,
        false_fn=lambda: int(runtime_settings['clip_stride_ms'] / window_stride_ms))  # pylint:disable=line-too-long
  first_time_flag = 0
  new_fingerprint_input = fingerprint_input[
      :, -num_new_frames*input_frequency_size:]
  # Expand to add input channels dimension.
  new_fingerprint_input = tf.expand_dims(new_fingerprint_input, 2)

  # Create the frequency filters.
  weights_frequency = tf.compat.v1.get_variable(
      name='weights_frequency',
      initializer=tf.compat.v1.truncated_normal_initializer(stddev=0.01),
      shape=[input_frequency_size, num_filters])
  # Expand to add input channels dimensions.
  # weights_frequency: [input_frequency_size, 1, num_filters]
  weights_frequency = tf.expand_dims(weights_frequency, 1)
  # Convolve the 1D feature filters sliding over the time dimension.
  # activations_time: [batch, num_new_frames, num_filters]
  activations_time = tf.nn.conv1d(input=new_fingerprint_input,
                                  filters=weights_frequency,
                                  stride=input_frequency_size,
                                  padding='VALID')
  # Rearrange such that we can perform the batched matmul.
  # activations_time: [num_filters, batch, num_new_frames]
  activations_time = tf.transpose(a=activations_time, perm=[2, 0, 1])

  # Runtime memory optimization.
  if not is_training:
    # We need to drop the activations corresponding to the oldest frames, and
    # then add those corresponding to the new frames.
    new_memory = memory[:, :, num_new_frames:]
    new_memory = tf.concat([new_memory, activations_time], 2)
    tf.compat.v1.assign(memory, new_memory)
    activations_time = new_memory

  # Create the time filters.
  weights_time = tf.compat.v1.get_variable(
      name='weights_time',
      initializer=tf.compat.v1.truncated_normal_initializer(stddev=0.01),
      shape=[num_filters, input_time_size])
  # Apply the time filter on the outputs of the feature filters.
  # weights_time: [num_filters, input_time_size, 1]
  # outputs: [num_filters, batch, 1]
  weights_time = tf.expand_dims(weights_time, 2)
  outputs = tf.matmul(activations_time, weights_time)
  # Split num_units and rank into separate dimensions (the remaining
  # dimension is the input_shape[0] - i.e. batch size). This also squeezes
  # the last dimension, since it's not used.
  # [num_filters, batch, 1] => [num_units, rank, batch]
  outputs = tf.reshape(outputs, [num_units, rank, -1])
  # Sum the rank outputs per unit => [num_units, batch].
  units_output = tf.reduce_sum(input_tensor=outputs, axis=1)
  # Transpose to shape [batch, num_units]
  units_output = tf.transpose(a=units_output)

  # Apply bias.
  bias = tf.compat.v1.get_variable(name='bias',
                                   initializer=tf.compat.v1.zeros_initializer,
                                   shape=[num_units])
  first_bias = tf.nn.bias_add(units_output, bias)

  # Relu.
  first_relu = tf.nn.relu(first_bias)

  if is_training:
    first_dropout = tf.nn.dropout(first_relu, rate=dropout_rate)
  else:
    first_dropout = first_relu

  first_fc_output_channels = 256
  first_fc_weights = tf.compat.v1.get_variable(
      name='first_fc_weights',
      initializer=tf.compat.v1.truncated_normal_initializer(stddev=0.01),
      shape=[num_units, first_fc_output_channels])
  first_fc_bias = tf.compat.v1.get_variable(
      name='first_fc_bias',
      initializer=tf.compat.v1.zeros_initializer,
      shape=[first_fc_output_channels])
  first_fc = tf.matmul(first_dropout, first_fc_weights) + first_fc_bias
  if is_training:
    second_fc_input = tf.nn.dropout(first_fc, rate=dropout_rate)
  else:
    second_fc_input = first_fc
  second_fc_output_channels = 256
  second_fc_weights = tf.compat.v1.get_variable(
      name='second_fc_weights',
      initializer=tf.compat.v1.truncated_normal_initializer(stddev=0.01),
      shape=[first_fc_output_channels, second_fc_output_channels])
  second_fc_bias = tf.compat.v1.get_variable(
      name='second_fc_bias',
      initializer=tf.compat.v1.zeros_initializer,
      shape=[second_fc_output_channels])
  second_fc = tf.matmul(second_fc_input, second_fc_weights) + second_fc_bias
  if is_training:
    final_fc_input = tf.nn.dropout(second_fc, rate=dropout_rate)
  else:
    final_fc_input = second_fc
  label_count = model_settings['label_count']
  final_fc_weights = tf.compat.v1.get_variable(
      name='final_fc_weights',
      initializer=tf.compat.v1.truncated_normal_initializer(stddev=0.01),
      shape=[second_fc_output_channels, label_count])
  final_fc_bias = tf.compat.v1.get_variable(
      name='final_fc_bias',
      initializer=tf.compat.v1.zeros_initializer,
      shape=[label_count])
  final_fc = tf.matmul(final_fc_input, final_fc_weights) + final_fc_bias
  if is_training:
    return final_fc, dropout_rate
  else:
    return final_fc
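
# Sizing note (editor's note, not part of the original module): the SVDF
# layer factors what would be a dense [98 * 40, 1280] = 5017600-weight matmul
# (for the 98x40 fingerprint from the examples above) into rank-2 pairs of 1D
# filters: a [40, 2560] frequency filter plus a [2560, 98] time filter, or
# 2 * 1280 * (40 + 98) = 353280 weights in total.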


def create_tiny_conv_model(fingerprint_input, model_settings, is_training):
  """Builds a convolutional model aimed at microcontrollers.

  Devices like DSPs and microcontrollers can have very small amounts of
  memory and limited processing power. This model is designed to use less
  than 20KB of working RAM, and fit within 32KB of read-only (flash) memory.

  Here's the layout of the graph:

  (fingerprint_input)
          v
      [Conv2D]<-(weights)
          v
      [BiasAdd]<-(bias)
          v
        [Relu]
          v
      [MatMul]<-(weights)
          v
      [BiasAdd]<-(bias)
          v

  This doesn't produce particularly accurate results, but it's designed to be
  used as the first stage of a pipeline, running on a low-energy piece of
  hardware that can always be on, and then wake higher-power chips when a
  possible utterance has been found, so that more accurate analysis can be done.

  During training, a dropout node is introduced after the relu, controlled by a
  placeholder.

  Args:
    fingerprint_input: TensorFlow node that will output audio feature vectors.
    model_settings: Dictionary of information about the model.
    is_training: Whether the model is going to be used for training.

  Returns:
    TensorFlow node outputting logits results, and optionally a dropout
    placeholder.
  """
  if is_training:
    dropout_rate = tf.compat.v1.placeholder(tf.float32, name='dropout_rate')
  input_frequency_size = model_settings['fingerprint_width']
  input_time_size = model_settings['spectrogram_length']
  fingerprint_4d = tf.reshape(fingerprint_input,
                              [-1, input_time_size, input_frequency_size, 1])
  first_filter_width = 8
  first_filter_height = 10
  first_filter_count = 8
  first_weights = tf.compat.v1.get_variable(
      name='first_weights',
      initializer=tf.compat.v1.truncated_normal_initializer(stddev=0.01),
      shape=[first_filter_height, first_filter_width, 1, first_filter_count])
  first_bias = tf.compat.v1.get_variable(
      name='first_bias',
      initializer=tf.compat.v1.zeros_initializer,
      shape=[first_filter_count])
  first_conv_stride_x = 2
  first_conv_stride_y = 2
  first_conv = tf.nn.conv2d(
      input=fingerprint_4d, filters=first_weights,
      strides=[1, first_conv_stride_y, first_conv_stride_x, 1],
      padding='SAME') + first_bias
  first_relu = tf.nn.relu(first_conv)
  if is_training:
    first_dropout = tf.nn.dropout(first_relu, rate=dropout_rate)
  else:
    first_dropout = first_relu
  first_dropout_shape = first_dropout.get_shape()
  first_dropout_output_width = first_dropout_shape[2]
  first_dropout_output_height = first_dropout_shape[1]
  first_dropout_element_count = int(
      first_dropout_output_width * first_dropout_output_height *
      first_filter_count)
  flattened_first_dropout = tf.reshape(first_dropout,
                                       [-1, first_dropout_element_count])
  label_count = model_settings['label_count']
  final_fc_weights = tf.compat.v1.get_variable(
      name='final_fc_weights',
      initializer=tf.compat.v1.truncated_normal_initializer(stddev=0.01),
      shape=[first_dropout_element_count, label_count])
  final_fc_bias = tf.compat.v1.get_variable(
      name='final_fc_bias',
      initializer=tf.compat.v1.zeros_initializer,
      shape=[label_count])
  final_fc = (
      tf.matmul(flattened_first_dropout, final_fc_weights) + final_fc_bias)
  if is_training:
    return final_fc, dropout_rate
  else:
    return final_fc
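
# Sizing note (editor's note, not part of the original module): with the
# 49x40 'micro' features and 4 labels used by the micro_speech example, the
# stride-2 convolution yields a 25x20x8 activation, so the final matmul has
# 25 * 20 * 8 = 4000 inputs and 4000 * 4 + 4 parameters, which quantizes to
# well under the 32KB flash budget.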


def create_tiny_embedding_conv_model(fingerprint_input, model_settings,
                                     is_training):
  """Builds a convolutional model aimed at microcontrollers.

  Devices like DSPs and microcontrollers can have very small amounts of
  memory and limited processing power. This model is designed to use less
  than 20KB of working RAM, and fit within 32KB of read-only (flash) memory.

  Here's the layout of the graph:

  (fingerprint_input)
          v
      [Conv2D]<-(weights)
          v
      [BiasAdd]<-(bias)
          v
        [Relu]
          v
      [Conv2D]<-(weights)
          v
      [BiasAdd]<-(bias)
          v
        [Relu]
          v
      [MatMul]<-(weights)
          v
      [BiasAdd]<-(bias)
          v

  This doesn't produce particularly accurate results, but it's designed to be
  used as the first stage of a pipeline, running on a low-energy piece of
  hardware that can always be on, and then wake higher-power chips when a
  possible utterance has been found, so that more accurate analysis can be done.

  During training, a dropout node is introduced after each relu, controlled by
  a placeholder.

  Args:
    fingerprint_input: TensorFlow node that will output audio feature vectors.
    model_settings: Dictionary of information about the model.
    is_training: Whether the model is going to be used for training.

  Returns:
    TensorFlow node outputting logits results, and optionally a dropout
    placeholder.
  """
  if is_training:
    dropout_rate = tf.compat.v1.placeholder(tf.float32, name='dropout_rate')
  input_frequency_size = model_settings['fingerprint_width']
  input_time_size = model_settings['spectrogram_length']
  fingerprint_4d = tf.reshape(fingerprint_input,
                              [-1, input_time_size, input_frequency_size, 1])

  first_filter_width = 8
  first_filter_height = 10
  first_filter_count = 8
  first_weights = tf.compat.v1.get_variable(
      name='first_weights',
      initializer=tf.compat.v1.truncated_normal_initializer(stddev=0.01),
      shape=[first_filter_height, first_filter_width, 1, first_filter_count])
  first_bias = tf.compat.v1.get_variable(
      name='first_bias',
      initializer=tf.compat.v1.zeros_initializer,
      shape=[first_filter_count])
  first_conv_stride_x = 2
  first_conv_stride_y = 2

  first_conv = tf.nn.conv2d(
      input=fingerprint_4d, filters=first_weights,
      strides=[1, first_conv_stride_y, first_conv_stride_x, 1],
      padding='SAME') + first_bias
  first_relu = tf.nn.relu(first_conv)
  if is_training:
    first_dropout = tf.nn.dropout(first_relu, rate=dropout_rate)
  else:
    first_dropout = first_relu

  second_filter_width = 8
  second_filter_height = 10
  second_filter_count = 8
  second_weights = tf.compat.v1.get_variable(
      name='second_weights',
      initializer=tf.compat.v1.truncated_normal_initializer(stddev=0.01),
      shape=[
          second_filter_height, second_filter_width, first_filter_count,
          second_filter_count
      ])
  second_bias = tf.compat.v1.get_variable(
      name='second_bias',
      initializer=tf.compat.v1.zeros_initializer,
      shape=[second_filter_count])
  second_conv_stride_x = 8
  second_conv_stride_y = 8
  second_conv = tf.nn.conv2d(
      input=first_dropout, filters=second_weights,
      strides=[1, second_conv_stride_y, second_conv_stride_x, 1],
      padding='SAME') + second_bias
  second_relu = tf.nn.relu(second_conv)
  if is_training:
    second_dropout = tf.nn.dropout(second_relu, rate=dropout_rate)
  else:
    second_dropout = second_relu

  second_dropout_shape = second_dropout.get_shape()
  second_dropout_output_width = second_dropout_shape[2]
  second_dropout_output_height = second_dropout_shape[1]
  second_dropout_element_count = int(second_dropout_output_width *
                                     second_dropout_output_height *
                                     second_filter_count)
  flattened_second_dropout = tf.reshape(second_dropout,
                                        [-1, second_dropout_element_count])
  label_count = model_settings['label_count']
  final_fc_weights = tf.compat.v1.get_variable(
      name='final_fc_weights',
      initializer=tf.compat.v1.truncated_normal_initializer(stddev=0.01),
      shape=[second_dropout_element_count, label_count])
  final_fc_bias = tf.compat.v1.get_variable(
      name='final_fc_bias',
      initializer=tf.compat.v1.zeros_initializer,
      shape=[label_count])
  final_fc = (
      tf.matmul(flattened_second_dropout, final_fc_weights) + final_fc_bias)
  if is_training:
    return final_fc, dropout_rate
  else:
    return final_fc
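
# End-to-end sketch (editor's note, not part of the original module;
# 'start_checkpoint' is supplied by the caller): derive settings, build a
# model, then restore a checkpoint before inference.
#
#   settings = prepare_model_settings(
#       label_count=12, sample_rate=16000, clip_duration_ms=1000,
#       window_size_ms=30, window_stride_ms=10, feature_bin_count=40,
#       preprocess='mfcc')
#   fingerprint_input = tf.compat.v1.placeholder(
#       tf.float32, [None, settings['fingerprint_size']])
#   logits = create_model(fingerprint_input, settings, 'tiny_conv',
#                         is_training=False)
#   with tf.compat.v1.Session() as sess:
#     load_variables_from_checkpoint(sess, start_checkpoint)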