1# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
2#
3# Licensed under the Apache License, Version 2.0 (the "License");
4# you may not use this file except in compliance with the License.
5# You may obtain a copy of the License at
6#
7# http://www.apache.org/licenses/LICENSE-2.0
8#
9# Unless required by applicable law or agreed to in writing, software
10# distributed under the License is distributed on an "AS IS" BASIS,
11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12# See the License for the specific language governing permissions and
13# limitations under the License.
14# ==============================================================================
15"""AudioMicrofrontend Op creates filterbanks from audio data."""
16
17from tensorflow.lite.experimental.microfrontend.ops import gen_audio_microfrontend_op
18from tensorflow.python.framework import dtypes
19from tensorflow.python.framework import load_library
20from tensorflow.python.framework import ops
21from tensorflow.python.ops import array_ops
22from tensorflow.python.platform import resource_loader
23from tensorflow.python.util.tf_export import tf_export
24
# Resolve the shared library that is bundled with this package as a data file,
# then load it as a TensorFlow op library. The returned module handle is kept
# in a private module-level name.
_audio_microfrontend_op_path = resource_loader.get_path_to_datafile(
    "_audio_microfrontend_op.so")
_audio_microfrontend_op = load_library.load_op_library(
    _audio_microfrontend_op_path)
27
28
@tf_export("lite.experimental.microfrontend.python.ops.audio_microfrontend")
def audio_microfrontend(audio,
                        sample_rate=16000,
                        window_size=25,
                        window_step=10,
                        num_channels=32,
                        upper_band_limit=7500.0,
                        lower_band_limit=125.0,
                        smoothing_bits=10,
                        even_smoothing=0.025,
                        odd_smoothing=0.06,
                        min_signal_remaining=0.05,
                        enable_pcan=True,
                        pcan_strength=0.95,
                        pcan_offset=80.0,
                        gain_bits=21,
                        enable_log=True,
                        scale_shift=6,
                        left_context=0,
                        right_context=0,
                        frame_stride=1,
                        zero_padding=False,
                        out_scale=1,
                        out_type=dtypes.uint16):
  """Audio Microfrontend Op.

  This Op converts a sequence of audio data into one or more
  feature vectors containing filterbanks of the input. The
  conversion process uses a lightweight library to perform:

  1. A slicing window function
  2. Short-time FFTs
  3. Filterbank calculations
  4. Noise reduction
  5. PCAN Auto Gain Control
  6. Logarithmic scaling

  Args:
    audio: 1D Tensor, int16 audio data in temporal ordering. A Tensor of rank
      greater than 1 is flattened to 1D before being passed to the op.
    sample_rate: Integer, the sample rate of the audio in Hz.
    window_size: Integer, length of desired time frames in ms.
    window_step: Integer, length of step size for the next frame in ms.
    num_channels: Integer, the number of filterbank channels to use.
    upper_band_limit: Float, the highest frequency included in the filterbanks.
    lower_band_limit: Float, the lowest frequency included in the filterbanks.
    smoothing_bits: Int, scale up signal by 2^(smoothing_bits) before reduction.
    even_smoothing: Float, smoothing coefficient for even-numbered channels.
    odd_smoothing: Float, smoothing coefficient for odd-numbered channels.
    min_signal_remaining: Float, fraction of signal to preserve in smoothing.
    enable_pcan: Bool, enable PCAN auto gain control.
    pcan_strength: Float, gain normalization exponent.
    pcan_offset: Float, positive value added in the normalization denominator.
    gain_bits: Int, number of fractional bits in the gain.
    enable_log: Bool, enable logarithmic scaling of filterbanks.
    scale_shift: Integer, scale filterbanks by 2^(scale_shift).
    left_context: Integer, number of preceding frames to attach to each frame.
    right_context: Integer, number of subsequent frames to attach to each
      frame.
    frame_stride: Integer, M frames to skip over, where output[n] = frame[n*M].
    zero_padding: Bool, if left/right context is out-of-bounds, attach frame of
      zeroes. Otherwise, frame[0] or frame[size-1] will be copied.
    out_scale: Integer, divide all filterbanks by this number.
    out_type: DType, type of the output Tensor, defaults to UINT16.

  Returns:
    filterbanks: 2D Tensor, each row is a time frame, each column is a channel.

  Raises:
    ValueError: If the rank of the audio tensor is not statically known.
  """
  rank = audio.shape.ndims
  if rank is None:
    # Without a static rank we cannot decide whether flattening is required.
    raise ValueError("Input to `AudioMicrofrontend` should have known rank.")
  if rank > 1:
    # Multi-dimensional input is collapsed into a single vector of samples.
    audio = array_ops.reshape(audio, [-1])

  # Arguments are forwarded positionally, in the same order as this function's
  # own signature.
  return gen_audio_microfrontend_op.audio_microfrontend(
      audio,
      sample_rate,
      window_size,
      window_step,
      num_channels,
      upper_band_limit,
      lower_band_limit,
      smoothing_bits,
      even_smoothing,
      odd_smoothing,
      min_signal_remaining,
      enable_pcan,
      pcan_strength,
      pcan_offset,
      gain_bits,
      enable_log,
      scale_shift,
      left_context,
      right_context,
      frame_stride,
      zero_padding,
      out_scale,
      out_type)
110
111
# Declare the "AudioMicrofrontend" op type as not differentiable, i.e. no
# gradient function is registered for it.
ops.NotDifferentiable("AudioMicrofrontend")