Coverage Report

Created: 2018-09-25 14:53

/src/mozilla-central/dom/media/webspeech/recognition/energy_endpointer.cc
Line
Count
Source (jump to first uncovered line)
1
// Copyright (c) 2013 The Chromium Authors. All rights reserved.
2
//
3
// Redistribution and use in source and binary forms, with or without
4
// modification, are permitted provided that the following conditions are
5
// met:
6
//
7
//    * Redistributions of source code must retain the above copyright
8
// notice, this list of conditions and the following disclaimer.
9
//    * Redistributions in binary form must reproduce the above
10
// copyright notice, this list of conditions and the following disclaimer
11
// in the documentation and/or other materials provided with the
12
// distribution.
13
//    * Neither the name of Google Inc. nor the names of its
14
// contributors may be used to endorse or promote products derived from
15
// this software without specific prior written permission.
16
//
17
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28
29
#include "energy_endpointer.h"
30
31
#include <math.h>
32
33
namespace {
34
35
// Returns the RMS (quadratic mean) of the input signal measured around its
// mean, i.e. sqrt(mean(x^2) - mean(x)^2), so a constant (DC-only) signal
// yields 0.
//
// |samples| points to |num_samples| 16-bit samples. When |num_samples| is not
// positive there is nothing to measure and 0 is returned; |samples| is not
// dereferenced in that case.
float RMS(const int16_t* samples, int num_samples) {
  // Without this guard the divisions below would be 0/0 and the function
  // would return NaN.
  if (num_samples <= 0)
    return 0.0f;

  // Accumulate in 64-bit integers so the sums stay exact.
  int64_t sum_int64_t = 0;
  int64_t ssq_int64_t = 0;
  for (int i = 0; i < num_samples; ++i) {
    const int64_t sample = samples[i];
    sum_int64_t += sample;
    ssq_int64_t += sample * sample;
  }

  // Now convert to floating point.
  const double mean = static_cast<double>(sum_int64_t) / num_samples;
  const double mean_square = static_cast<double>(ssq_int64_t) / num_samples;

  // Rounding can push a mathematically non-negative variance slightly below
  // zero; clamp it so sqrt() never produces NaN.
  double variance = mean_square - (mean * mean);
  if (variance < 0.0)
    variance = 0.0;

  return static_cast<float>(sqrt(variance));
}
49
50
0
// Converts a duration in seconds to microseconds. 0.5 is added before the
// value is truncated to an integer.
int64_t Secs2Usecs(float seconds) {
  const double microseconds = 1.0e6 * seconds;
  return static_cast<int64_t>(0.5 + microseconds);
}
53
54
0
// Converts a linear amplitude to decibels (20 * log10). Inputs that are not
// above the 1.0e-100 cutoff (zero, negative values, NaN) map to the fixed
// sentinel -2000.
float GetDecibel(float value) {
  if (!(value > 1.0e-100))
    return -2000.0;
  return 20 * log10(value);
}
59
60
}  // namespace
61
62
namespace mozilla {
63
64
// Stores threshold-crossing histories for making decisions about the speech
65
// state.
66
class EnergyEndpointer::HistoryRing {
67
 public:
68
0
  HistoryRing() : insertion_index_(0) {}
69
70
  // Resets the ring to |size| elements each with state |initial_state|
71
  void SetRing(int size, bool initial_state);
72
73
  // Inserts a new entry into the ring and drops the oldest entry.
74
  void Insert(int64_t time_us, bool decision);
75
76
  // Returns the time in microseconds of the most recently added entry.
77
  int64_t EndTime() const;
78
79
  // Returns the sum of all intervals during which 'decision' is true within
80
  // the time in seconds specified by 'duration'. The returned interval is
81
  // in seconds.
82
  float RingSum(float duration_sec);
83
84
 private:
85
  struct DecisionPoint {
86
    int64_t time_us;
87
    bool decision;
88
  };
89
90
  std::vector<DecisionPoint> decision_points_;
91
  int insertion_index_;  // Index at which the next item gets added/inserted.
92
93
  HistoryRing(const HistoryRing&);
94
  void operator=(const HistoryRing&);
95
};
96
97
0
void EnergyEndpointer::HistoryRing::SetRing(int size, bool initial_state) {
98
0
  insertion_index_ = 0;
99
0
  decision_points_.clear();
100
0
  DecisionPoint init = { -1, initial_state };
101
0
  decision_points_.resize(size, init);
102
0
}
103
104
0
void EnergyEndpointer::HistoryRing::Insert(int64_t time_us, bool decision) {
105
0
  decision_points_[insertion_index_].time_us = time_us;
106
0
  decision_points_[insertion_index_].decision = decision;
107
0
  insertion_index_ = (insertion_index_ + 1) % decision_points_.size();
108
0
}
109
110
0
int64_t EnergyEndpointer::HistoryRing::EndTime() const {
111
0
  int ind = insertion_index_ - 1;
112
0
  if (ind < 0)
113
0
    ind = decision_points_.size() - 1;
114
0
  return decision_points_[ind].time_us;
115
0
}
116
117
0
float EnergyEndpointer::HistoryRing::RingSum(float duration_sec) {
118
0
  if (decision_points_.empty())
119
0
    return 0.0;
120
0
121
0
  int64_t sum_us = 0;
122
0
  int ind = insertion_index_ - 1;
123
0
  if (ind < 0)
124
0
    ind = decision_points_.size() - 1;
125
0
  int64_t end_us = decision_points_[ind].time_us;
126
0
  bool is_on = decision_points_[ind].decision;
127
0
  int64_t start_us = end_us - static_cast<int64_t>(0.5 + (1.0e6 * duration_sec));
128
0
  if (start_us < 0)
129
0
    start_us = 0;
130
0
  size_t n_summed = 1;  // n points ==> (n-1) intervals
131
0
  while ((decision_points_[ind].time_us > start_us) &&
132
0
         (n_summed < decision_points_.size())) {
133
0
    --ind;
134
0
    if (ind < 0)
135
0
      ind = decision_points_.size() - 1;
136
0
    if (is_on)
137
0
      sum_us += end_us - decision_points_[ind].time_us;
138
0
    is_on = decision_points_[ind].decision;
139
0
    end_us = decision_points_[ind].time_us;
140
0
    n_summed++;
141
0
  }
142
0
143
0
  return 1.0e-6f * sum_us;  //  Returns total time that was super threshold.
144
0
}
145
146
// Builds an endpointer in the pre-speech state. Every numeric member starts
// at zero except the maximum window duration (4.0), and a fresh, unsized
// HistoryRing is allocated.
EnergyEndpointer::EnergyEndpointer()
    : status_(EP_PRE_SPEECH),
      offset_confirm_dur_sec_(0),
      endpointer_time_us_(0),
      fast_update_frames_(0),
      frame_counter_(0),
      max_window_dur_(4.0),
      sample_rate_(0),
      history_(new HistoryRing()),
      decision_threshold_(0),
      estimating_environment_(false),
      noise_level_(0),
      rms_adapt_(0),
      start_lag_(0),
      end_lag_(0),
      user_input_start_time_us_(0) {}
163
164
0
// No explicit cleanup is performed in the destructor body.
EnergyEndpointer::~EnergyEndpointer() {}
166
167
0
int EnergyEndpointer::TimeToFrame(float time) const {
168
0
  return static_cast<int32_t>(0.5 + (time / params_.frame_period()));
169
0
}
170
171
0
void EnergyEndpointer::Restart(bool reset_threshold) {
172
0
  status_ = EP_PRE_SPEECH;
173
0
  user_input_start_time_us_ = 0;
174
0
175
0
  if (reset_threshold) {
176
0
    decision_threshold_ = params_.decision_threshold();
177
0
    rms_adapt_ = decision_threshold_;
178
0
    noise_level_ = params_.decision_threshold() / 2.0f;
179
0
    frame_counter_ = 0;  // Used for rapid initial update of levels.
180
0
  }
181
0
182
0
  // Set up the memories to hold the history windows.
183
0
  history_->SetRing(TimeToFrame(max_window_dur_), false);
184
0
185
0
  // Flag that indicates that current input should be used for
186
0
  // estimating the environment. The user has not yet started input
187
0
  // by e.g. pressed the push-to-talk button. By default, this is
188
0
  // false for backward compatibility.
189
0
  estimating_environment_ = false;
190
0
}
191
192
0
void EnergyEndpointer::Init(const EnergyEndpointerParams& params) {
193
0
  params_ = params;
194
0
195
0
  // Find the longest history interval to be used, and make the ring
196
0
  // large enough to accommodate that number of frames.  NOTE: This
197
0
  // depends upon ep_frame_period being set correctly in the factory
198
0
  // that did this instantiation.
199
0
  max_window_dur_ = params_.onset_window();
200
0
  if (params_.speech_on_window() > max_window_dur_)
201
0
    max_window_dur_ = params_.speech_on_window();
202
0
  if (params_.offset_window() > max_window_dur_)
203
0
    max_window_dur_ = params_.offset_window();
204
0
  Restart(true);
205
0
206
0
  offset_confirm_dur_sec_ = params_.offset_window() -
207
0
                            params_.offset_confirm_dur();
208
0
  if (offset_confirm_dur_sec_ < 0.0)
209
0
    offset_confirm_dur_sec_ = 0.0;
210
0
211
0
  user_input_start_time_us_ = 0;
212
0
213
0
  // Flag that indicates that  current input should be used for
214
0
  // estimating the environment. The user has not yet started input
215
0
  // by e.g. pressed the push-to-talk button. By default, this is
216
0
  // false for backward compatibility.
217
0
  estimating_environment_ = false;
218
0
  // The initial value of the noise and speech levels is inconsequential.
219
0
  // The level of the first frame will overwrite these values.
220
0
  noise_level_ = params_.decision_threshold() / 2.0f;
221
0
  fast_update_frames_ =
222
0
      static_cast<int64_t>(params_.fast_update_dur() / params_.frame_period());
223
0
224
0
  frame_counter_ = 0;  // Used for rapid initial update of levels.
225
0
226
0
  sample_rate_ = params_.sample_rate();
227
0
  start_lag_ = static_cast<int>(sample_rate_ /
228
0
                                params_.max_fundamental_frequency());
229
0
  end_lag_ = static_cast<int>(sample_rate_ /
230
0
                              params_.min_fundamental_frequency());
231
0
}
232
233
0
void EnergyEndpointer::StartSession() {
234
0
  Restart(true);
235
0
}
236
237
0
void EnergyEndpointer::EndSession() {
238
0
  status_ = EP_POST_SPEECH;
239
0
}
240
241
0
void EnergyEndpointer::SetEnvironmentEstimationMode() {
242
0
  Restart(true);
243
0
  estimating_environment_ = true;
244
0
}
245
246
0
void EnergyEndpointer::SetUserInputMode() {
247
0
  estimating_environment_ = false;
248
0
  user_input_start_time_us_ = endpointer_time_us_;
249
0
}
250
251
void EnergyEndpointer::ProcessAudioFrame(int64_t time_us,
252
                                         const int16_t* samples,
253
                                         int num_samples,
254
0
                                         float* rms_out) {
255
0
  endpointer_time_us_ = time_us;
256
0
  float rms = RMS(samples, num_samples);
257
0
258
0
  // Check that this is user input audio vs. pre-input adaptation audio.
259
0
  // Input audio starts when the user indicates start of input, by e.g.
260
0
  // pressing push-to-talk. Audio recieved prior to that is used to update
261
0
  // noise and speech level estimates.
262
0
  if (!estimating_environment_) {
263
0
    bool decision = false;
264
0
    if ((endpointer_time_us_ - user_input_start_time_us_) <
265
0
        Secs2Usecs(params_.contamination_rejection_period())) {
266
0
      decision = false;
267
0
      //PR_LOG(GetSpeechRecognitionLog(), PR_LOG_DEBUG, ("decision: forced to false, time: %d", endpointer_time_us_));
268
0
    } else {
269
0
      decision = (rms > decision_threshold_);
270
0
    }
271
0
272
0
    history_->Insert(endpointer_time_us_, decision);
273
0
274
0
    switch (status_) {
275
0
      case EP_PRE_SPEECH:
276
0
        if (history_->RingSum(params_.onset_window()) >
277
0
            params_.onset_detect_dur()) {
278
0
          status_ = EP_POSSIBLE_ONSET;
279
0
        }
280
0
        break;
281
0
282
0
      case EP_POSSIBLE_ONSET: {
283
0
        float tsum = history_->RingSum(params_.onset_window());
284
0
        if (tsum > params_.onset_confirm_dur()) {
285
0
          status_ = EP_SPEECH_PRESENT;
286
0
        } else {  // If signal is not maintained, drop back to pre-speech.
287
0
          if (tsum <= params_.onset_detect_dur())
288
0
            status_ = EP_PRE_SPEECH;
289
0
        }
290
0
        break;
291
0
      }
292
0
293
0
      case EP_SPEECH_PRESENT: {
294
0
        // To induce hysteresis in the state residency, we allow a
295
0
        // smaller residency time in the on_ring, than was required to
296
0
        // enter the SPEECH_PERSENT state.
297
0
        float on_time = history_->RingSum(params_.speech_on_window());
298
0
        if (on_time < params_.on_maintain_dur())
299
0
          status_ = EP_POSSIBLE_OFFSET;
300
0
        break;
301
0
      }
302
0
303
0
      case EP_POSSIBLE_OFFSET:
304
0
        if (history_->RingSum(params_.offset_window()) <=
305
0
            offset_confirm_dur_sec_) {
306
0
          // Note that this offset time may be beyond the end
307
0
          // of the input buffer in a real-time system.  It will be up
308
0
          // to the RecognizerSession to decide what to do.
309
0
          status_ = EP_PRE_SPEECH;  // Automatically reset for next utterance.
310
0
        } else {  // If speech picks up again we allow return to SPEECH_PRESENT.
311
0
          if (history_->RingSum(params_.speech_on_window()) >=
312
0
              params_.on_maintain_dur())
313
0
            status_ = EP_SPEECH_PRESENT;
314
0
        }
315
0
        break;
316
0
317
0
      default:
318
0
        break;
319
0
    }
320
0
321
0
    // If this is a quiet, non-speech region, slowly adapt the detection
322
0
    // threshold to be about 6dB above the average RMS.
323
0
    if ((!decision) && (status_ == EP_PRE_SPEECH)) {
324
0
      decision_threshold_ = (0.98f * decision_threshold_) + (0.02f * 2 * rms);
325
0
      rms_adapt_ = decision_threshold_;
326
0
    } else {
327
0
      // If this is in a speech region, adapt the decision threshold to
328
0
      // be about 10dB below the average RMS. If the noise level is high,
329
0
      // the threshold is pushed up.
330
0
      // Adaptation up to a higher level is 5 times faster than decay to
331
0
      // a lower level.
332
0
      if ((status_ == EP_SPEECH_PRESENT) && decision) {
333
0
        if (rms_adapt_ > rms) {
334
0
          rms_adapt_ = (0.99f * rms_adapt_) + (0.01f * rms);
335
0
        } else {
336
0
          rms_adapt_ = (0.95f * rms_adapt_) + (0.05f * rms);
337
0
        }
338
0
        float target_threshold = 0.3f * rms_adapt_ +  noise_level_;
339
0
        decision_threshold_ = (.90f * decision_threshold_) +
340
0
                              (0.10f * target_threshold);
341
0
      }
342
0
    }
343
0
344
0
    // Set a floor
345
0
    if (decision_threshold_ < params_.min_decision_threshold())
346
0
      decision_threshold_ = params_.min_decision_threshold();
347
0
  }
348
0
349
0
  // Update speech and noise levels.
350
0
  UpdateLevels(rms);
351
0
  ++frame_counter_;
352
0
353
0
  if (rms_out)
354
0
    *rms_out = GetDecibel(rms);
355
0
}
356
357
0
float EnergyEndpointer::GetNoiseLevelDb() const {
358
0
  return GetDecibel(noise_level_);
359
0
}
360
361
0
void EnergyEndpointer::UpdateLevels(float rms) {
362
0
  // Update quickly initially. We assume this is noise and that
363
0
  // speech is 6dB above the noise.
364
0
  if (frame_counter_ < fast_update_frames_) {
365
0
    // Alpha increases from 0 to (k-1)/k where k is the number of time
366
0
    // steps in the initial adaptation period.
367
0
    float alpha = static_cast<float>(frame_counter_) /
368
0
        static_cast<float>(fast_update_frames_);
369
0
    noise_level_ = (alpha * noise_level_) + ((1 - alpha) * rms);
370
0
    //PR_LOG(GetSpeechRecognitionLog(), PR_LOG_DEBUG, ("FAST UPDATE, frame_counter_ %d, fast_update_frames_ %d", frame_counter_, fast_update_frames_));
371
0
  } else {
372
0
    // Update Noise level. The noise level adapts quickly downward, but
373
0
    // slowly upward. The noise_level_ parameter is not currently used
374
0
    // for threshold adaptation. It is used for UI feedback.
375
0
    if (noise_level_ < rms)
376
0
      noise_level_ = (0.999f * noise_level_) + (0.001f * rms);
377
0
    else
378
0
      noise_level_ = (0.95f * noise_level_) + (0.05f * rms);
379
0
  }
380
0
  if (estimating_environment_ || (frame_counter_ < fast_update_frames_)) {
381
0
    decision_threshold_ = noise_level_ * 2; // 6dB above noise level.
382
0
    // Set a floor
383
0
    if (decision_threshold_ < params_.min_decision_threshold())
384
0
      decision_threshold_ = params_.min_decision_threshold();
385
0
  }
386
0
}
387
388
0
// Returns the current endpointer status. |status_time| is dereferenced
// unconditionally and receives the time of the newest entry in the history
// ring.
EpStatus EnergyEndpointer::Status(int64_t* status_time) const {
  const int64_t newest_time_us = history_->EndTime();
  *status_time = newest_time_us;
  return status_;
}
392
393
}  // namespace mozilla