/src/mozilla-central/dom/media/webspeech/recognition/energy_endpointer.h
Line | Count | Source (jump to first uncovered line) |
1 | | // Copyright (c) 2013 The Chromium Authors. All rights reserved. |
2 | | // |
3 | | // Redistribution and use in source and binary forms, with or without |
4 | | // modification, are permitted provided that the following conditions are |
5 | | // met: |
6 | | // |
7 | | // * Redistributions of source code must retain the above copyright |
8 | | // notice, this list of conditions and the following disclaimer. |
9 | | // * Redistributions in binary form must reproduce the above |
10 | | // copyright notice, this list of conditions and the following disclaimer |
11 | | // in the documentation and/or other materials provided with the |
12 | | // distribution. |
13 | | // * Neither the name of Google Inc. nor the names of its |
14 | | // contributors may be used to endorse or promote products derived from |
15 | | // this software without specific prior written permission. |
16 | | // |
17 | | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
18 | | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
19 | | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
20 | | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT |
21 | | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
22 | | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
23 | | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
24 | | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
25 | | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
26 | | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
27 | | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
28 | | |
29 | | // The EnergyEndpointer class finds likely speech onset and offset points. |
30 | | // |
31 | | // The implementation described here is about the simplest possible. |
32 | | // It is based on timings of threshold crossings for overall signal |
33 | | // RMS. It is suitable for lightweight applications. |
34 | | // |
35 | | // As written, the basic idea is that one specifies intervals that |
36 | | // must be occupied by super- and sub-threshold energy levels, and |
37 | | // defers decisions re onset and offset times until these |
38 | | // specifications have been met. Three basic intervals are tested: an |
39 | | // onset window, a speech-on window, and an offset window. We require |
40 | | // super-threshold to exceed some minimum total durations in the onset |
41 | | // and speech-on windows before declaring the speech onset time, and |
42 | | // we specify a required sub-threshold residency in the offset window |
43 | | // before declaring speech offset. As the various residency requirements are |
44 | | // met, the EnergyEndpointer instance assumes various states, and can return the |
45 | | // ID of these states to the client (see EpStatus below). |
46 | | // |
47 | | // The levels of the speech and background noise are continuously updated. It is |
48 | | // important that the background noise level be estimated initially for |
49 | | // robustness in noisy conditions. The first frames are assumed to be background |
50 | | // noise and a fast update rate is used for the noise level. The duration for |
51 | | // fast update is controlled by the fast_update_dur_ parameter. |
52 | | // |
53 | | // If used in noisy conditions, the endpointer should be started and run in the |
54 | | // EnvironmentEstimation mode, for at least 200ms, before switching to |
55 | | // UserInputMode. |
56 | | // Audio feedback contamination can appear in the input audio, if not cut |
57 | | // out or handled by echo cancellation. Audio feedback can trigger a false |
58 | | // accept. The false accepts can be ignored by setting |
59 | | // ep_contamination_rejection_period. |
60 | | |
61 | | #ifndef CONTENT_BROWSER_SPEECH_ENDPOINTER_ENERGY_ENDPOINTER_H_ |
62 | | #define CONTENT_BROWSER_SPEECH_ENDPOINTER_ENERGY_ENDPOINTER_H_ |
63 | | |
64 | | #include <vector> |
65 | | |
66 | | #include "nsAutoPtr.h" |
67 | | |
68 | | #include "energy_endpointer_params.h" |
69 | | |
70 | | namespace mozilla { |
71 | | |
72 | | // Endpointer status codes; values run consecutively from EP_PRE_SPEECH = 10. |
73 | | enum EpStatus { |
74 | | EP_PRE_SPEECH = 10, |
75 | | EP_POSSIBLE_ONSET, |
76 | | EP_SPEECH_PRESENT, |
77 | | EP_POSSIBLE_OFFSET, |
78 | | EP_POST_SPEECH, |
79 | | }; |
80 | | |
81 | | class EnergyEndpointer { |
82 | | public: |
83 | | // Default construction MUST be followed by a call to Init() before any |
84 | | // other use is made of the instance. |
85 | | EnergyEndpointer(); |
86 | | virtual ~EnergyEndpointer(); |
87 | | |
88 | | void Init(const EnergyEndpointerParams& params); |
89 | | |
90 | | // Start the endpointer. This should be called at the beginning of a session. |
91 | | void StartSession(); |
92 | | |
93 | | // Stop the endpointer. |
94 | | void EndSession(); |
95 | | |
96 | | // Start environment estimation: incoming audio is used for environment |
97 | | // (i.e. noise level) estimation. |
98 | | void SetEnvironmentEstimationMode(); |
99 | | |
100 | | // Start user input. This should be called when the user indicates start of |
101 | | // input, e.g. by pressing a button. |
102 | | void SetUserInputMode(); |
103 | | |
104 | | // Processes the next input frame of num_samples 16-bit samples and updates the |
105 | | // EnergyEndpointer status as appropriate. rms_out: presumably the frame RMS. |
106 | | void ProcessAudioFrame(int64_t time_us, |
107 | | const int16_t* samples, int num_samples, |
108 | | float* rms_out); |
109 | | |
110 | | // Returns the current state of the EnergyEndpointer; the time corresponding |
111 | | // to the most recently computed frame is reported via status_time_us. |
112 | | EpStatus Status(int64_t* status_time_us) const; |
113 | | |
114 | 0 | bool estimating_environment() const { |
115 | 0 | return estimating_environment_; |
116 | 0 | } |
117 | | |
118 | | // Returns the estimated noise level in dB. |
119 | | float GetNoiseLevelDb() const; |
120 | | |
121 | | private: |
122 | | class HistoryRing; |
123 | | |
124 | | // Resets the endpointer internal state. If reset_threshold is true, the |
125 | | // state will be reset completely, including adaptive thresholds and the |
126 | | // removal of all history information. |
127 | | void Restart(bool reset_threshold); |
128 | | |
129 | | // Update internal speech and noise levels. |
130 | | void UpdateLevels(float rms); |
131 | | |
132 | | // Returns the number of frames (or frame number) corresponding to |
133 | | // the 'time' (in seconds). |
134 | | int TimeToFrame(float time) const; |
135 | | |
136 | | EpStatus status_; // The current state of this instance. |
137 | | float offset_confirm_dur_sec_; // Max on-time allowed to confirm POST_SPEECH. |
138 | | int64_t endpointer_time_us_; // Time of the most recently received audio frame. |
139 | | int64_t fast_update_frames_; // Number of frames for initial level adaptation. |
140 | | int64_t frame_counter_; // Number of frames seen. Used for initial adaptation. |
141 | | float max_window_dur_; // Largest search window size (seconds). |
142 | | float sample_rate_; // Sampling rate. |
143 | | |
144 | | // Ring buffer (HistoryRing) holding the speech activity history. |
145 | | nsAutoPtr<HistoryRing> history_; |
146 | | |
147 | | // Configuration parameters. |
148 | | EnergyEndpointerParams params_; |
149 | | |
150 | | // RMS which must be exceeded to conclude frame is speech. |
151 | | float decision_threshold_; |
152 | | |
153 | | // Flag to indicate that audio should be used to estimate environment, prior |
154 | | // to receiving user input. |
155 | | bool estimating_environment_; |
156 | | |
157 | | // Estimate of the background noise level. Used externally for UI feedback. |
158 | | float noise_level_; |
159 | | |
160 | | // An adaptive threshold used to update decision_threshold_ when appropriate. |
161 | | float rms_adapt_; |
162 | | |
163 | | // Start lag corresponds to the highest fundamental frequency. |
164 | | int start_lag_; |
165 | | |
166 | | // End lag corresponds to the lowest fundamental frequency. |
167 | | int end_lag_; |
168 | | |
169 | | // Time when mode switched from environment estimation to user input. This |
170 | | // is used to time forced rejection of audio feedback contamination. |
171 | | int64_t user_input_start_time_us_; |
172 | | |
173 | | // Copy constructor and assignment are declared private to prevent copying. |
174 | | EnergyEndpointer(const EnergyEndpointer&); |
175 | | void operator=(const EnergyEndpointer&); |
176 | | }; |
177 | | |
178 | | } // namespace mozilla |
179 | | |
180 | | #endif // CONTENT_BROWSER_SPEECH_ENDPOINTER_ENERGY_ENDPOINTER_H_ |