/src/mozilla-central/dom/media/webspeech/recognition/energy_endpointer.h

Source (jump to first uncovered line)
// Copyright (c) 2013 The Chromium Authors. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//    * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//    * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//    * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

// The EnergyEndpointer class finds likely speech onset and offset points.
//
// The implementation described here is about the simplest possible.
// It is based on timings of threshold crossings for overall signal
// RMS. It is suitable for light weight applications.
//
// As written, the basic idea is that one specifies intervals that
// must be occupied by super- and sub-threshold energy levels, and
// defers decisions re onset and offset times until these
// specifications have been met.  Three basic intervals are tested: an
// onset window, a speech-on window, and an offset window.  We require
// super-threshold to exceed some minimum total durations in the onset
// and speech-on windows before declaring the speech onset time, and
// we specify a required sub-threshold residency in the offset window
// before declaring speech offset. As the various residency requirements are
// met, the EnergyEndpointer instance assumes various states, and can return the
// ID of these states to the client (see EpStatus below).
//
// The levels of the speech and background noise are continuously updated. It is
// important that the background noise level be estimated initially for
// robustness in noisy conditions. The first frames are assumed to be background
// noise and a fast update rate is used for the noise level. The duration for
// fast update is controlled by the fast_update_dur_ parameter.
//
// If used in noisy conditions, the endpointer should be started and run in the
// EnvironmentEstimation mode, for at least 200ms, before switching to
// UserInputMode.
// Audio feedback contamination can appear in the input audio, if not cut
// out or handled by echo cancellation. Audio feedback can trigger a false
// accept. The false accepts can be ignored by setting
// ep_contamination_rejection_period.

#ifndef CONTENT_BROWSER_SPEECH_ENDPOINTER_ENERGY_ENDPOINTER_H_
#define CONTENT_BROWSER_SPEECH_ENDPOINTER_ENERGY_ENDPOINTER_H_

#include <vector>

#include "nsAutoPtr.h"

#include "energy_endpointer_params.h"

namespace mozilla {

// Endpointer status codes.
//
// These are the states an EnergyEndpointer can report to its client. The
// numeric values are consecutive, starting at 10, and are written out
// explicitly so the value of each state is visible at a glance.
enum EpStatus {
  EP_PRE_SPEECH = 10,
  EP_POSSIBLE_ONSET = 11,
  EP_SPEECH_PRESENT = 12,
  EP_POSSIBLE_OFFSET = 13,
  EP_POST_SPEECH = 14
};

// Energy-based speech endpointer. Consumes audio frames one at a time and
// exposes its current state as an EpStatus together with the time of the most
// recently processed frame.
//
// Instances are not copyable.
class EnergyEndpointer {
 public:
  // The default construction MUST be followed by Init(), before any
  // other use can be made of the instance.
  EnergyEndpointer();
  virtual ~EnergyEndpointer();

  // Configures the instance from |params|. Required after default
  // construction and before any other method is used.
  void Init(const EnergyEndpointerParams& params);

  // Start the endpointer. This should be called at the beginning of a session.
  void StartSession();

  // Stop the endpointer.
  void EndSession();

  // Start environment estimation. Audio will be used for environment estimation
  // i.e. noise level estimation.
  void SetEnvironmentEstimationMode();

  // Start user input. This should be called when the user indicates start of
  // input, e.g. by pressing a button.
  void SetUserInputMode();

  // Computes the next input frame and modifies EnergyEndpointer status as
  // appropriate based on the computation.
  //
  //   time_us     - timestamp of the frame, in microseconds.
  //   samples     - pointer to |num_samples| 16-bit samples.
  //   num_samples - number of samples available at |samples|.
  //   rms_out     - output parameter; presumably receives the RMS computed for
  //                 this frame. NOTE(review): whether a null pointer is
  //                 accepted is not visible from this header - confirm against
  //                 the implementation.
  void ProcessAudioFrame(int64_t time_us,
                         const int16_t* samples, int num_samples,
                         float* rms_out);

  // Returns the current state of the EnergyEndpointer and the time
  // corresponding to the most recently computed frame. The time is delivered
  // through the |status_time_us| output parameter, in microseconds.
  EpStatus Status(int64_t* status_time_us) const;

  // Returns the flag indicating that audio is currently being used to estimate
  // the environment rather than being treated as user input.
  bool estimating_environment() const {
    return estimating_environment_;
  }

  // Returns estimated noise level in dB.
  float GetNoiseLevelDb() const;

 private:
  // Defined in the implementation file; only a pointer to it is held here.
  class HistoryRing;

  // Resets the endpointer internal state.  If reset_threshold is true, the
  // state will be reset completely, including adaptive thresholds and the
  // removal of all history information.
  void Restart(bool reset_threshold);

  // Update internal speech and noise levels.
  void UpdateLevels(float rms);

  // Returns the number of frames (or frame number) corresponding to
  // the 'time' (in seconds).
  int TimeToFrame(float time) const;

  // NOTE: keep the data members below in this order. Their declaration order
  // is also their initialization order in the constructor.

  EpStatus status_;  // The current state of this instance.
  float offset_confirm_dur_sec_;  // max on time allowed to confirm POST_SPEECH
  int64_t endpointer_time_us_;  // Time of the most recently received audio frame.
  int64_t fast_update_frames_; // Number of frames for initial level adaptation.
  int64_t frame_counter_;  // Number of frames seen. Used for initial adaptation.
  float max_window_dur_;  // Largest search window size (seconds)
  float sample_rate_;  // Sampling rate.

  // Ring buffers to hold the speech activity history.
  nsAutoPtr<HistoryRing> history_;

  // Configuration parameters.
  EnergyEndpointerParams params_;

  // RMS which must be exceeded to conclude frame is speech.
  float decision_threshold_;

  // Flag to indicate that audio should be used to estimate environment, prior
  // to receiving user input.
  bool estimating_environment_;

  // Estimate of the background noise level. Used externally for UI feedback.
  float noise_level_;

  // An adaptive threshold used to update decision_threshold_ when appropriate.
  float rms_adapt_;

  // Start lag corresponds to the highest fundamental frequency.
  int start_lag_;

  // End lag corresponds to the lowest fundamental frequency.
  int end_lag_;

  // Time when mode switched from environment estimation to user input. This
  // is used to time forced rejection of audio feedback contamination.
  int64_t user_input_start_time_us_;

  // Copying is disallowed. The operations are explicitly deleted (rather than
  // merely declared and left undefined) so that any attempted copy, including
  // one made from inside a member function, is rejected at compile time
  // instead of surfacing as a link error.
  EnergyEndpointer(const EnergyEndpointer&) = delete;
  void operator=(const EnergyEndpointer&) = delete;
};

}  // namespace mozilla

#endif  // CONTENT_BROWSER_SPEECH_ENDPOINTER_ENERGY_ENDPOINTER_H_

Line	Count	Source (jump to first uncovered line)
1		// Copyright (c) 2013 The Chromium Authors. All rights reserved.
2		//
3		// Redistribution and use in source and binary forms, with or without
4		// modification, are permitted provided that the following conditions are
5		// met:
6		//
7		// * Redistributions of source code must retain the above copyright
8		// notice, this list of conditions and the following disclaimer.
9		// * Redistributions in binary form must reproduce the above
10		// copyright notice, this list of conditions and the following disclaimer
11		// in the documentation and/or other materials provided with the
12		// distribution.
13		// * Neither the name of Google Inc. nor the names of its
14		// contributors may be used to endorse or promote products derived from
15		// this software without specific prior written permission.
16		//
17		// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18		// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19		// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20		// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21		// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22		// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23		// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24		// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25		// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26		// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27		// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28
29		// The EnergyEndpointer class finds likely speech onset and offset points.
30		//
31		// The implementation described here is about the simplest possible.
32		// It is based on timings of threshold crossings for overall signal
33		// RMS. It is suitable for light weight applications.
34		//
35		// As written, the basic idea is that one specifies intervals that
36		// must be occupied by super- and sub-threshold energy levels, and
37		// defers decisions re onset and offset times until these
38		// specifications have been met. Three basic intervals are tested: an
39		// onset window, a speech-on window, and an offset window. We require
40		// super-threshold to exceed some minimum total durations in the onset
41		// and speech-on windows before declaring the speech onset time, and
42		// we specify a required sub-threshold residency in the offset window
43		// before declaring speech offset. As the various residency requirements are
44		// met, the EnergyEndpointer instance assumes various states, and can return the
45		// ID of these states to the client (see EpStatus below).
46		//
47		// The levels of the speech and background noise are continuously updated. It is
48		// important that the background noise level be estimated initially for
49		// robustness in noisy conditions. The first frames are assumed to be background
50		// noise and a fast update rate is used for the noise level. The duration for
51		// fast update is controlled by the fast_update_dur_ parameter.
52		//
53		// If used in noisy conditions, the endpointer should be started and run in the
54		// EnvironmentEstimation mode, for at least 200ms, before switching to
55		// UserInputMode.
56		// Audio feedback contamination can appear in the input audio, if not cut
57		// out or handled by echo cancellation. Audio feedback can trigger a false
58		// accept. The false accepts can be ignored by setting
59		// ep_contamination_rejection_period.
60
61		#ifndef CONTENT_BROWSER_SPEECH_ENDPOINTER_ENERGY_ENDPOINTER_H_
62		#define CONTENT_BROWSER_SPEECH_ENDPOINTER_ENERGY_ENDPOINTER_H_
63
64		#include <vector>
65
66		#include "nsAutoPtr.h"
67
68		#include "energy_endpointer_params.h"
69
70		namespace mozilla {
71
72		// Endpointer status codes
73		enum EpStatus {
74		EP_PRE_SPEECH = 10,
75		EP_POSSIBLE_ONSET,
76		EP_SPEECH_PRESENT,
77		EP_POSSIBLE_OFFSET,
78		EP_POST_SPEECH,
79		};
80
81		class EnergyEndpointer {
82		public:
83		// The default construction MUST be followed by Init(), before any
84		// other use can be made of the instance.
85		EnergyEndpointer();
86		virtual ~EnergyEndpointer();
87
88		void Init(const EnergyEndpointerParams& params);
89
90		// Start the endpointer. This should be called at the beginning of a session.
91		void StartSession();
92
93		// Stop the endpointer.
94		void EndSession();
95
96		// Start environment estimation. Audio will be used for environment estimation
97		// i.e. noise level estimation.
98		void SetEnvironmentEstimationMode();
99
100		// Start user input. This should be called when the user indicates start of
101		// input, e.g. by pressing a button.
102		void SetUserInputMode();
103
104		// Computes the next input frame and modifies EnergyEndpointer status as
105		// appropriate based on the computation.
106		void ProcessAudioFrame(int64_t time_us,
107		const int16_t* samples, int num_samples,
108		float* rms_out);
109
110		// Returns the current state of the EnergyEndpointer and the time
111		// corresponding to the most recently computed frame.
112		EpStatus Status(int64_t* status_time_us) const;
113
114	0	bool estimating_environment() const {
115	0	return estimating_environment_;
116	0	}
117
118		// Returns estimated noise level in dB.
119		float GetNoiseLevelDb() const;
120
121		private:
122		class HistoryRing;
123
124		// Resets the endpointer internal state. If reset_threshold is true, the
125		// state will be reset completely, including adaptive thresholds and the
126		// removal of all history information.
127		void Restart(bool reset_threshold);
128
129		// Update internal speech and noise levels.
130		void UpdateLevels(float rms);
131
132		// Returns the number of frames (or frame number) corresponding to
133		// the 'time' (in seconds).
134		int TimeToFrame(float time) const;
135
136		EpStatus status_; // The current state of this instance.
137		float offset_confirm_dur_sec_; // max on time allowed to confirm POST_SPEECH
138		int64_t endpointer_time_us_; // Time of the most recently received audio frame.
139		int64_t fast_update_frames_; // Number of frames for initial level adaptation.
140		int64_t frame_counter_; // Number of frames seen. Used for initial adaptation.
141		float max_window_dur_; // Largest search window size (seconds)
142		float sample_rate_; // Sampling rate.
143
144		// Ring buffers to hold the speech activity history.
145		nsAutoPtr<HistoryRing> history_;
146
147		// Configuration parameters.
148		EnergyEndpointerParams params_;
149
150		// RMS which must be exceeded to conclude frame is speech.
151		float decision_threshold_;
152
153		// Flag to indicate that audio should be used to estimate environment, prior
154		// to receiving user input.
155		bool estimating_environment_;
156
157		// Estimate of the background noise level. Used externally for UI feedback.
158		float noise_level_;
159
160		// An adaptive threshold used to update decision_threshold_ when appropriate.
161		float rms_adapt_;
162
163		// Start lag corresponds to the highest fundamental frequency.
164		int start_lag_;
165
166		// End lag corresponds to the lowest fundamental frequency.
167		int end_lag_;
168
169		// Time when mode switched from environment estimation to user input. This
170		// is used to time forced rejection of audio feedback contamination.
171		int64_t user_input_start_time_us_;
172
173		// prevent copy constructor and assignment
174		EnergyEndpointer(const EnergyEndpointer&);
175		void operator=(const EnergyEndpointer&);
176		};
177
178		} // namespace mozilla
179
180		#endif // CONTENT_BROWSER_SPEECH_ENDPOINTER_ENERGY_ENDPOINTER_H_

Coverage Report

Created: 2018-09-25 14:53