/src/mozilla-central/dom/media/webspeech/recognition/energy_endpointer.cc
Line | Count | Source (jump to first uncovered line) |
1 | | // Copyright (c) 2013 The Chromium Authors. All rights reserved. |
2 | | // |
3 | | // Redistribution and use in source and binary forms, with or without |
4 | | // modification, are permitted provided that the following conditions are |
5 | | // met: |
6 | | // |
7 | | // * Redistributions of source code must retain the above copyright |
8 | | // notice, this list of conditions and the following disclaimer. |
9 | | // * Redistributions in binary form must reproduce the above |
10 | | // copyright notice, this list of conditions and the following disclaimer |
11 | | // in the documentation and/or other materials provided with the |
12 | | // distribution. |
13 | | // * Neither the name of Google Inc. nor the names of its |
14 | | // contributors may be used to endorse or promote products derived from |
15 | | // this software without specific prior written permission. |
16 | | // |
17 | | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
18 | | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
19 | | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
20 | | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT |
21 | | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
22 | | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
23 | | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
24 | | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
25 | | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
26 | | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
27 | | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
28 | | |
29 | | #include "energy_endpointer.h" |
30 | | |
31 | | #include <math.h> |
32 | | |
33 | | namespace { |
34 | | |
// Returns the RMS (quadratic mean) of the input signal after its mean (DC
// offset) has been removed, i.e. sqrt(mean(x^2) - mean(x)^2).
//
// |samples| must point to at least |num_samples| values. Returns 0 when
// |samples| is null or |num_samples| is not positive, rather than dividing
// by zero and handing NaN to the caller.
float RMS(const int16_t* samples, int num_samples) {
  if (!samples || num_samples <= 0)
    return 0.0f;

  // Accumulate in 64-bit integers so both sums stay exact.
  int64_t ssq_int64_t = 0;
  int64_t sum_int64_t = 0;
  for (int i = 0; i < num_samples; ++i) {
    const int64_t sample = samples[i];
    sum_int64_t += sample;
    ssq_int64_t += sample * sample;
  }

  // Now convert to floating point.
  const double mean = static_cast<double>(sum_int64_t) / num_samples;
  const double mean_square = static_cast<double>(ssq_int64_t) / num_samples;

  // Floating-point rounding can push the difference marginally below zero
  // for (nearly) constant input; clamp so sqrt() never produces NaN.
  const double variance = mean_square - (mean * mean);
  if (variance <= 0.0)
    return 0.0f;
  return static_cast<float>(sqrt(variance));
}
49 | | |
// Converts a duration in seconds to whole microseconds. One half is added
// before the truncating cast so that non-negative inputs round to nearest.
int64_t Secs2Usecs(float seconds) {
  const double microseconds = 1.0e6 * seconds;
  return static_cast<int64_t>(0.5 + microseconds);
}
53 | | |
// Converts a linear amplitude to decibels (20 * log10(value)). Any input
// that is not greater than 1.0e-100 -- zero, negative values and NaN --
// yields the fixed floor of -2000.
float GetDecibel(float value) {
  if (!(value > 1.0e-100))
    return -2000.0;
  return 20 * log10(value);
}
59 | | |
60 | | } // namespace |
61 | | |
62 | | namespace mozilla { |
63 | | |
// Stores threshold-crossing histories for making decisions about the speech
// state.
//
// The history is a fixed-size circular buffer of (timestamp, decision)
// pairs. A newly constructed ring holds no entries; call SetRing() with a
// positive size before inserting into it.
class EnergyEndpointer::HistoryRing {
 public:
  // Creates an empty ring with the insertion index at zero.
  HistoryRing() : insertion_index_(0) {}

  // Resets the ring to |size| elements each with state |initial_state|.
  // Every element's timestamp is reset to -1.
  void SetRing(int size, bool initial_state);

  // Inserts a new entry into the ring and drops the oldest entry.
  void Insert(int64_t time_us, bool decision);

  // Returns the time in microseconds of the most recently added entry.
  int64_t EndTime() const;

  // Returns the sum of all intervals during which 'decision' is true within
  // the time in seconds specified by 'duration'. The returned interval is
  // in seconds.
  float RingSum(float duration_sec);

 private:
  // One history sample: when it was recorded and whether the signal was
  // above the decision threshold at that time.
  struct DecisionPoint {
    int64_t time_us;
    bool decision;
  };

  std::vector<DecisionPoint> decision_points_;
  int insertion_index_;  // Index at which the next item gets added/inserted.

  // Private, with no definition in this file, so the ring is not copied or
  // assigned.
  HistoryRing(const HistoryRing&);
  void operator=(const HistoryRing&);
};
96 | | |
97 | 0 | void EnergyEndpointer::HistoryRing::SetRing(int size, bool initial_state) { |
98 | 0 | insertion_index_ = 0; |
99 | 0 | decision_points_.clear(); |
100 | 0 | DecisionPoint init = { -1, initial_state }; |
101 | 0 | decision_points_.resize(size, init); |
102 | 0 | } |
103 | | |
104 | 0 | void EnergyEndpointer::HistoryRing::Insert(int64_t time_us, bool decision) { |
105 | 0 | decision_points_[insertion_index_].time_us = time_us; |
106 | 0 | decision_points_[insertion_index_].decision = decision; |
107 | 0 | insertion_index_ = (insertion_index_ + 1) % decision_points_.size(); |
108 | 0 | } |
109 | | |
110 | 0 | int64_t EnergyEndpointer::HistoryRing::EndTime() const { |
111 | 0 | int ind = insertion_index_ - 1; |
112 | 0 | if (ind < 0) |
113 | 0 | ind = decision_points_.size() - 1; |
114 | 0 | return decision_points_[ind].time_us; |
115 | 0 | } |
116 | | |
117 | 0 | float EnergyEndpointer::HistoryRing::RingSum(float duration_sec) { |
118 | 0 | if (decision_points_.empty()) |
119 | 0 | return 0.0; |
120 | 0 | |
121 | 0 | int64_t sum_us = 0; |
122 | 0 | int ind = insertion_index_ - 1; |
123 | 0 | if (ind < 0) |
124 | 0 | ind = decision_points_.size() - 1; |
125 | 0 | int64_t end_us = decision_points_[ind].time_us; |
126 | 0 | bool is_on = decision_points_[ind].decision; |
127 | 0 | int64_t start_us = end_us - static_cast<int64_t>(0.5 + (1.0e6 * duration_sec)); |
128 | 0 | if (start_us < 0) |
129 | 0 | start_us = 0; |
130 | 0 | size_t n_summed = 1; // n points ==> (n-1) intervals |
131 | 0 | while ((decision_points_[ind].time_us > start_us) && |
132 | 0 | (n_summed < decision_points_.size())) { |
133 | 0 | --ind; |
134 | 0 | if (ind < 0) |
135 | 0 | ind = decision_points_.size() - 1; |
136 | 0 | if (is_on) |
137 | 0 | sum_us += end_us - decision_points_[ind].time_us; |
138 | 0 | is_on = decision_points_[ind].decision; |
139 | 0 | end_us = decision_points_[ind].time_us; |
140 | 0 | n_summed++; |
141 | 0 | } |
142 | 0 |
|
143 | 0 | return 1.0e-6f * sum_us; // Returns total time that was super threshold. |
144 | 0 | } |
145 | | |
// Constructs an endpointer in the EP_PRE_SPEECH state with all levels,
// counters and timestamps zeroed. The history ring is allocated here but
// holds no entries until Restart() sizes it. |max_window_dur_| starts at
// 4.0 and is recomputed from the parameters by Init().
EnergyEndpointer::EnergyEndpointer()
    : status_(EP_PRE_SPEECH),
      offset_confirm_dur_sec_(0),
      endpointer_time_us_(0),
      fast_update_frames_(0),
      frame_counter_(0),
      max_window_dur_(4.0),
      sample_rate_(0),
      history_(new HistoryRing()),
      decision_threshold_(0),
      estimating_environment_(false),
      noise_level_(0),
      rms_adapt_(0),
      start_lag_(0),
      end_lag_(0),
      user_input_start_time_us_(0) {
}
163 | | |
// Nothing is released explicitly here.
// NOTE(review): |history_| is allocated with new in the constructor; it is
// presumably held by an owning smart pointer declared in the header --
// confirm.
EnergyEndpointer::~EnergyEndpointer() {
}
166 | | |
167 | 0 | int EnergyEndpointer::TimeToFrame(float time) const { |
168 | 0 | return static_cast<int32_t>(0.5 + (time / params_.frame_period())); |
169 | 0 | } |
170 | | |
171 | 0 | void EnergyEndpointer::Restart(bool reset_threshold) { |
172 | 0 | status_ = EP_PRE_SPEECH; |
173 | 0 | user_input_start_time_us_ = 0; |
174 | 0 |
|
175 | 0 | if (reset_threshold) { |
176 | 0 | decision_threshold_ = params_.decision_threshold(); |
177 | 0 | rms_adapt_ = decision_threshold_; |
178 | 0 | noise_level_ = params_.decision_threshold() / 2.0f; |
179 | 0 | frame_counter_ = 0; // Used for rapid initial update of levels. |
180 | 0 | } |
181 | 0 |
|
182 | 0 | // Set up the memories to hold the history windows. |
183 | 0 | history_->SetRing(TimeToFrame(max_window_dur_), false); |
184 | 0 |
|
185 | 0 | // Flag that indicates that current input should be used for |
186 | 0 | // estimating the environment. The user has not yet started input |
187 | 0 | // by e.g. pressed the push-to-talk button. By default, this is |
188 | 0 | // false for backward compatibility. |
189 | 0 | estimating_environment_ = false; |
190 | 0 | } |
191 | | |
192 | 0 | void EnergyEndpointer::Init(const EnergyEndpointerParams& params) { |
193 | 0 | params_ = params; |
194 | 0 |
|
195 | 0 | // Find the longest history interval to be used, and make the ring |
196 | 0 | // large enough to accommodate that number of frames. NOTE: This |
197 | 0 | // depends upon ep_frame_period being set correctly in the factory |
198 | 0 | // that did this instantiation. |
199 | 0 | max_window_dur_ = params_.onset_window(); |
200 | 0 | if (params_.speech_on_window() > max_window_dur_) |
201 | 0 | max_window_dur_ = params_.speech_on_window(); |
202 | 0 | if (params_.offset_window() > max_window_dur_) |
203 | 0 | max_window_dur_ = params_.offset_window(); |
204 | 0 | Restart(true); |
205 | 0 |
|
206 | 0 | offset_confirm_dur_sec_ = params_.offset_window() - |
207 | 0 | params_.offset_confirm_dur(); |
208 | 0 | if (offset_confirm_dur_sec_ < 0.0) |
209 | 0 | offset_confirm_dur_sec_ = 0.0; |
210 | 0 |
|
211 | 0 | user_input_start_time_us_ = 0; |
212 | 0 |
|
213 | 0 | // Flag that indicates that current input should be used for |
214 | 0 | // estimating the environment. The user has not yet started input |
215 | 0 | // by e.g. pressed the push-to-talk button. By default, this is |
216 | 0 | // false for backward compatibility. |
217 | 0 | estimating_environment_ = false; |
218 | 0 | // The initial value of the noise and speech levels is inconsequential. |
219 | 0 | // The level of the first frame will overwrite these values. |
220 | 0 | noise_level_ = params_.decision_threshold() / 2.0f; |
221 | 0 | fast_update_frames_ = |
222 | 0 | static_cast<int64_t>(params_.fast_update_dur() / params_.frame_period()); |
223 | 0 |
|
224 | 0 | frame_counter_ = 0; // Used for rapid initial update of levels. |
225 | 0 |
|
226 | 0 | sample_rate_ = params_.sample_rate(); |
227 | 0 | start_lag_ = static_cast<int>(sample_rate_ / |
228 | 0 | params_.max_fundamental_frequency()); |
229 | 0 | end_lag_ = static_cast<int>(sample_rate_ / |
230 | 0 | params_.min_fundamental_frequency()); |
231 | 0 | } |
232 | | |
// Begins a new session by restarting the endpointer with its thresholds,
// level estimates and decision history reset.
void EnergyEndpointer::StartSession() {
  Restart(true);
}
236 | | |
// Ends the session by marking the status as EP_POST_SPEECH. No other state
// is touched.
void EnergyEndpointer::EndSession() {
  status_ = EP_POST_SPEECH;
}
240 | | |
// Restarts the endpointer with thresholds reset and then switches it to
// environment-estimation mode, in which ProcessAudioFrame() only updates the
// level estimates and makes no speech decisions.
void EnergyEndpointer::SetEnvironmentEstimationMode() {
  Restart(true);
  // Must follow Restart(), which clears this flag.
  estimating_environment_ = true;
}
245 | | |
// Leaves environment-estimation mode and records the time of the most
// recently processed frame as the start of user input.
void EnergyEndpointer::SetUserInputMode() {
  estimating_environment_ = false;
  user_input_start_time_us_ = endpointer_time_us_;
}
250 | | |
// Processes one frame of audio: computes its level, updates the speech
// state machine and the adaptive decision threshold (unless the endpointer
// is in environment-estimation mode), then updates the noise level.
//
//   time_us     - timestamp of this frame in microseconds.
//   samples     - the frame's samples; |num_samples| of them are read.
//   num_samples - number of samples in the frame.
//   rms_out     - optional; when non-null receives the frame level converted
//                 to decibels by GetDecibel().
void EnergyEndpointer::ProcessAudioFrame(int64_t time_us,
                                         const int16_t* samples,
                                         int num_samples,
                                         float* rms_out) {
  endpointer_time_us_ = time_us;
  float rms = RMS(samples, num_samples);

  // Check that this is user input audio vs. pre-input adaptation audio.
  // Input audio starts when the user indicates start of input, by e.g.
  // pressing push-to-talk. Audio received prior to that is used to update
  // noise and speech level estimates.
  if (!estimating_environment_) {
    bool decision = false;
    // Frames that arrive within the contamination rejection period after
    // the start of user input are always treated as below threshold.
    if ((endpointer_time_us_ - user_input_start_time_us_) <
        Secs2Usecs(params_.contamination_rejection_period())) {
      decision = false;
      //PR_LOG(GetSpeechRecognitionLog(), PR_LOG_DEBUG, ("decision: forced to false, time: %d", endpointer_time_us_));
    } else {
      decision = (rms > decision_threshold_);
    }

    history_->Insert(endpointer_time_us_, decision);

    switch (status_) {
      case EP_PRE_SPEECH:
        // Enough above-threshold time in the onset window: possible onset.
        if (history_->RingSum(params_.onset_window()) >
            params_.onset_detect_dur()) {
          status_ = EP_POSSIBLE_ONSET;
        }
        break;

      case EP_POSSIBLE_ONSET: {
        float tsum = history_->RingSum(params_.onset_window());
        if (tsum > params_.onset_confirm_dur()) {
          status_ = EP_SPEECH_PRESENT;
        } else {  // If signal is not maintained, drop back to pre-speech.
          if (tsum <= params_.onset_detect_dur())
            status_ = EP_PRE_SPEECH;
        }
        break;
      }

      case EP_SPEECH_PRESENT: {
        // To induce hysteresis in the state residency, we allow a
        // smaller residency time in the on_ring, than was required to
        // enter the SPEECH_PRESENT state.
        float on_time = history_->RingSum(params_.speech_on_window());
        if (on_time < params_.on_maintain_dur())
          status_ = EP_POSSIBLE_OFFSET;
        break;
      }

      case EP_POSSIBLE_OFFSET:
        if (history_->RingSum(params_.offset_window()) <=
            offset_confirm_dur_sec_) {
          // Note that this offset time may be beyond the end
          // of the input buffer in a real-time system. It will be up
          // to the RecognizerSession to decide what to do.
          status_ = EP_PRE_SPEECH;  // Automatically reset for next utterance.
        } else {  // If speech picks up again we allow return to SPEECH_PRESENT.
          if (history_->RingSum(params_.speech_on_window()) >=
              params_.on_maintain_dur())
            status_ = EP_SPEECH_PRESENT;
        }
        break;

      default:
        break;
    }

    // If this is a quiet, non-speech region, slowly adapt the detection
    // threshold to be about 6dB above the average RMS.
    if ((!decision) && (status_ == EP_PRE_SPEECH)) {
      decision_threshold_ = (0.98f * decision_threshold_) + (0.02f * 2 * rms);
      rms_adapt_ = decision_threshold_;
    } else {
      // If this is in a speech region, adapt the decision threshold to
      // be about 10dB below the average RMS. If the noise level is high,
      // the threshold is pushed up.
      // Adaptation up to a higher level is 5 times faster than decay to
      // a lower level.
      if ((status_ == EP_SPEECH_PRESENT) && decision) {
        if (rms_adapt_ > rms) {
          rms_adapt_ = (0.99f * rms_adapt_) + (0.01f * rms);
        } else {
          rms_adapt_ = (0.95f * rms_adapt_) + (0.05f * rms);
        }
        float target_threshold = 0.3f * rms_adapt_ + noise_level_;
        decision_threshold_ = (.90f * decision_threshold_) +
                              (0.10f * target_threshold);
      }
    }

    // Set a floor so the threshold never drops below the configured minimum.
    if (decision_threshold_ < params_.min_decision_threshold())
      decision_threshold_ = params_.min_decision_threshold();
  }

  // Update the noise level (done in both modes) and count the frame.
  UpdateLevels(rms);
  ++frame_counter_;

  if (rms_out)
    *rms_out = GetDecibel(rms);
}
356 | | |
// Returns the current noise level estimate converted to decibels by
// GetDecibel().
float EnergyEndpointer::GetNoiseLevelDb() const {
  return GetDecibel(noise_level_);
}
360 | | |
361 | 0 | void EnergyEndpointer::UpdateLevels(float rms) { |
362 | 0 | // Update quickly initially. We assume this is noise and that |
363 | 0 | // speech is 6dB above the noise. |
364 | 0 | if (frame_counter_ < fast_update_frames_) { |
365 | 0 | // Alpha increases from 0 to (k-1)/k where k is the number of time |
366 | 0 | // steps in the initial adaptation period. |
367 | 0 | float alpha = static_cast<float>(frame_counter_) / |
368 | 0 | static_cast<float>(fast_update_frames_); |
369 | 0 | noise_level_ = (alpha * noise_level_) + ((1 - alpha) * rms); |
370 | 0 | //PR_LOG(GetSpeechRecognitionLog(), PR_LOG_DEBUG, ("FAST UPDATE, frame_counter_ %d, fast_update_frames_ %d", frame_counter_, fast_update_frames_)); |
371 | 0 | } else { |
372 | 0 | // Update Noise level. The noise level adapts quickly downward, but |
373 | 0 | // slowly upward. The noise_level_ parameter is not currently used |
374 | 0 | // for threshold adaptation. It is used for UI feedback. |
375 | 0 | if (noise_level_ < rms) |
376 | 0 | noise_level_ = (0.999f * noise_level_) + (0.001f * rms); |
377 | 0 | else |
378 | 0 | noise_level_ = (0.95f * noise_level_) + (0.05f * rms); |
379 | 0 | } |
380 | 0 | if (estimating_environment_ || (frame_counter_ < fast_update_frames_)) { |
381 | 0 | decision_threshold_ = noise_level_ * 2; // 6dB above noise level. |
382 | 0 | // Set a floor |
383 | 0 | if (decision_threshold_ < params_.min_decision_threshold()) |
384 | 0 | decision_threshold_ = params_.min_decision_threshold(); |
385 | 0 | } |
386 | 0 | } |
387 | | |
388 | 0 | EpStatus EnergyEndpointer::Status(int64_t* status_time) const { |
389 | 0 | *status_time = history_->EndTime(); |
390 | 0 | return status_; |
391 | 0 | } |
392 | | |
393 | | } // namespace mozilla |