/src/mozilla-central/dom/media/webspeech/recognition/endpointer.cc
Line | Count | Source (jump to first uncovered line) |
1 | | // Copyright (c) 2013 The Chromium Authors. All rights reserved. |
2 | | // |
3 | | // Redistribution and use in source and binary forms, with or without |
4 | | // modification, are permitted provided that the following conditions are |
5 | | // met: |
6 | | // |
7 | | // * Redistributions of source code must retain the above copyright |
8 | | // notice, this list of conditions and the following disclaimer. |
9 | | // * Redistributions in binary form must reproduce the above |
10 | | // copyright notice, this list of conditions and the following disclaimer |
11 | | // in the documentation and/or other materials provided with the |
12 | | // distribution. |
13 | | // * Neither the name of Google Inc. nor the names of its |
14 | | // contributors may be used to endorse or promote products derived from |
15 | | // this software without specific prior written permission. |
16 | | // |
17 | | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
18 | | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
19 | | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
20 | | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT |
21 | | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
22 | | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
23 | | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
24 | | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
25 | | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
26 | | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
27 | | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
28 | | |
29 | | #include "endpointer.h" |
30 | | |
31 | | #include "AudioSegment.h" |
32 | | |
namespace {
// Number of analysis frames per second of audio: at 200 frames/s every frame
// covers 5 ms.
const int kFrameRate = 200;
}  // namespace
36 | | |
37 | | namespace mozilla { |
38 | | |
39 | | Endpointer::Endpointer(int sample_rate) |
40 | | : speech_input_possibly_complete_silence_length_us_(-1), |
41 | | speech_input_complete_silence_length_us_(-1), |
42 | | audio_frame_time_us_(0), |
43 | | sample_rate_(sample_rate), |
44 | 0 | frame_size_(0) { |
45 | 0 | Reset(); |
46 | 0 |
|
47 | 0 | frame_size_ = static_cast<int>(sample_rate / static_cast<float>(kFrameRate)); |
48 | 0 |
|
49 | 0 | speech_input_minimum_length_us_ = |
50 | 0 | static_cast<int64_t>(1.7 * 1000000); |
51 | 0 | speech_input_complete_silence_length_us_ = |
52 | 0 | static_cast<int64_t>(0.5 * 1000000); |
53 | 0 | long_speech_input_complete_silence_length_us_ = -1; |
54 | 0 | long_speech_length_us_ = -1; |
55 | 0 | speech_input_possibly_complete_silence_length_us_ = |
56 | 0 | 1 * 1000000; |
57 | 0 |
|
58 | 0 | // Set the default configuration for Push To Talk mode. |
59 | 0 | EnergyEndpointerParams ep_config; |
60 | 0 | ep_config.set_frame_period(1.0f / static_cast<float>(kFrameRate)); |
61 | 0 | ep_config.set_frame_duration(1.0f / static_cast<float>(kFrameRate)); |
62 | 0 | ep_config.set_endpoint_margin(0.2f); |
63 | 0 | ep_config.set_onset_window(0.15f); |
64 | 0 | ep_config.set_speech_on_window(0.4f); |
65 | 0 | ep_config.set_offset_window(0.15f); |
66 | 0 | ep_config.set_onset_detect_dur(0.09f); |
67 | 0 | ep_config.set_onset_confirm_dur(0.075f); |
68 | 0 | ep_config.set_on_maintain_dur(0.10f); |
69 | 0 | ep_config.set_offset_confirm_dur(0.12f); |
70 | 0 | ep_config.set_decision_threshold(1000.0f); |
71 | 0 | ep_config.set_min_decision_threshold(50.0f); |
72 | 0 | ep_config.set_fast_update_dur(0.2f); |
73 | 0 | ep_config.set_sample_rate(static_cast<float>(sample_rate)); |
74 | 0 | ep_config.set_min_fundamental_frequency(57.143f); |
75 | 0 | ep_config.set_max_fundamental_frequency(400.0f); |
76 | 0 | ep_config.set_contamination_rejection_period(0.25f); |
77 | 0 | energy_endpointer_.Init(ep_config); |
78 | 0 | } |
79 | | |
80 | 0 | void Endpointer::Reset() { |
81 | 0 | old_ep_status_ = EP_PRE_SPEECH; |
82 | 0 | waiting_for_speech_possibly_complete_timeout_ = false; |
83 | 0 | waiting_for_speech_complete_timeout_ = false; |
84 | 0 | speech_previously_detected_ = false; |
85 | 0 | speech_input_complete_ = false; |
86 | 0 | audio_frame_time_us_ = 0; // Reset time for packets sent to endpointer. |
87 | 0 | speech_end_time_us_ = -1; |
88 | 0 | speech_start_time_us_ = -1; |
89 | 0 | } |
90 | | |
91 | 0 | void Endpointer::StartSession() { |
92 | 0 | Reset(); |
93 | 0 | energy_endpointer_.StartSession(); |
94 | 0 | } |
95 | | |
96 | 0 | void Endpointer::EndSession() { |
97 | 0 | energy_endpointer_.EndSession(); |
98 | 0 | } |
99 | | |
100 | 0 | void Endpointer::SetEnvironmentEstimationMode() { |
101 | 0 | Reset(); |
102 | 0 | energy_endpointer_.SetEnvironmentEstimationMode(); |
103 | 0 | } |
104 | | |
105 | 0 | void Endpointer::SetUserInputMode() { |
106 | 0 | energy_endpointer_.SetUserInputMode(); |
107 | 0 | } |
108 | | |
109 | 0 | EpStatus Endpointer::Status(int64_t *time) { |
110 | 0 | return energy_endpointer_.Status(time); |
111 | 0 | } |
112 | | |
113 | 0 | EpStatus Endpointer::ProcessAudio(const AudioChunk& raw_audio, float* rms_out) { |
114 | 0 | MOZ_ASSERT(raw_audio.mBufferFormat == AUDIO_FORMAT_S16, "Audio is not in 16 bit format"); |
115 | 0 | const int16_t* audio_data = static_cast<const int16_t*>(raw_audio.mChannelData[0]); |
116 | 0 | const int num_samples = raw_audio.mDuration; |
117 | 0 | EpStatus ep_status = EP_PRE_SPEECH; |
118 | 0 |
|
119 | 0 | // Process the input data in blocks of frame_size_, dropping any incomplete |
120 | 0 | // frames at the end (which is ok since typically the caller will be recording |
121 | 0 | // audio in multiples of our frame size). |
122 | 0 | int sample_index = 0; |
123 | 0 | while (sample_index + frame_size_ <= num_samples) { |
124 | 0 | // Have the endpointer process the frame. |
125 | 0 | energy_endpointer_.ProcessAudioFrame(audio_frame_time_us_, |
126 | 0 | audio_data + sample_index, |
127 | 0 | frame_size_, |
128 | 0 | rms_out); |
129 | 0 | sample_index += frame_size_; |
130 | 0 | audio_frame_time_us_ += (frame_size_ * 1000000) / |
131 | 0 | sample_rate_; |
132 | 0 |
|
133 | 0 | // Get the status of the endpointer. |
134 | 0 | int64_t ep_time; |
135 | 0 | ep_status = energy_endpointer_.Status(&ep_time); |
136 | 0 | if (old_ep_status_ != ep_status) |
137 | 0 | fprintf(stderr, "Status changed old= %d, new= %d\n", old_ep_status_, ep_status); |
138 | 0 |
|
139 | 0 | // Handle state changes. |
140 | 0 | if ((EP_SPEECH_PRESENT == ep_status) && |
141 | 0 | (EP_POSSIBLE_ONSET == old_ep_status_)) { |
142 | 0 | speech_end_time_us_ = -1; |
143 | 0 | waiting_for_speech_possibly_complete_timeout_ = false; |
144 | 0 | waiting_for_speech_complete_timeout_ = false; |
145 | 0 | // Trigger SpeechInputDidStart event on first detection. |
146 | 0 | if (false == speech_previously_detected_) { |
147 | 0 | speech_previously_detected_ = true; |
148 | 0 | speech_start_time_us_ = ep_time; |
149 | 0 | } |
150 | 0 | } |
151 | 0 | if ((EP_PRE_SPEECH == ep_status) && |
152 | 0 | (EP_POSSIBLE_OFFSET == old_ep_status_)) { |
153 | 0 | speech_end_time_us_ = ep_time; |
154 | 0 | waiting_for_speech_possibly_complete_timeout_ = true; |
155 | 0 | waiting_for_speech_complete_timeout_ = true; |
156 | 0 | } |
157 | 0 | if (ep_time > speech_input_minimum_length_us_) { |
158 | 0 | // Speech possibly complete timeout. |
159 | 0 | if ((waiting_for_speech_possibly_complete_timeout_) && |
160 | 0 | (ep_time - speech_end_time_us_ > |
161 | 0 | speech_input_possibly_complete_silence_length_us_)) { |
162 | 0 | waiting_for_speech_possibly_complete_timeout_ = false; |
163 | 0 | } |
164 | 0 | if (waiting_for_speech_complete_timeout_) { |
165 | 0 | // The length of the silence timeout period can be held constant, or it |
166 | 0 | // can be changed after a fixed amount of time from the beginning of |
167 | 0 | // speech. |
168 | 0 | bool has_stepped_silence = |
169 | 0 | (long_speech_length_us_ > 0) && |
170 | 0 | (long_speech_input_complete_silence_length_us_ > 0); |
171 | 0 | int64_t requested_silence_length; |
172 | 0 | if (has_stepped_silence && |
173 | 0 | (ep_time - speech_start_time_us_) > long_speech_length_us_) { |
174 | 0 | requested_silence_length = |
175 | 0 | long_speech_input_complete_silence_length_us_; |
176 | 0 | } else { |
177 | 0 | requested_silence_length = |
178 | 0 | speech_input_complete_silence_length_us_; |
179 | 0 | } |
180 | 0 |
|
181 | 0 | // Speech complete timeout. |
182 | 0 | if ((ep_time - speech_end_time_us_) > requested_silence_length) { |
183 | 0 | waiting_for_speech_complete_timeout_ = false; |
184 | 0 | speech_input_complete_ = true; |
185 | 0 | } |
186 | 0 | } |
187 | 0 | } |
188 | 0 | old_ep_status_ = ep_status; |
189 | 0 | } |
190 | 0 | return ep_status; |
191 | 0 | } |
192 | | |
193 | | } // namespace mozilla |