/src/mozilla-central/dom/media/webspeech/recognition/energy_endpointer.cc

Source (jump to first uncovered line)
// Copyright (c) 2013 The Chromium Authors. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//    * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//    * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//    * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include "energy_endpointer.h"

#include <math.h>

namespace {

// Returns the level of the input signal with its mean removed: the square
// root of (mean of the squares - square of the mean). For a signal whose mean
// is zero this equals the RMS (quadratic mean).
//
//   samples      - array of |num_samples| 16-bit samples; it is not read when
//                  |num_samples| is not positive.
//   num_samples  - number of samples to read from |samples|.
//
// Returns 0 when |num_samples| <= 0. Dividing by a zero sample count would
// otherwise produce NaN, and ProcessAudioFrame() feeds this value straight
// into the running noise-level and threshold estimates.
float RMS(const int16_t* samples, int num_samples) {
  if (num_samples <= 0)
    return 0.0f;

  // Accumulate in 64-bit integers so the sums are exact.
  int64_t ssq_int64_t = 0;
  int64_t sum_int64_t = 0;
  for (int i = 0; i < num_samples; ++i) {
    const int64_t sample = samples[i];
    sum_int64_t += sample;
    ssq_int64_t += sample * sample;
  }

  // now convert to floats.
  const double mean = static_cast<double>(sum_int64_t) / num_samples;
  const double mean_square = static_cast<double>(ssq_int64_t) / num_samples;

  // Mathematically this is never negative, but rounding in the two divisions
  // can leave a tiny negative residue; clamp it so sqrt() cannot return NaN.
  double variance = mean_square - (mean * mean);
  if (variance < 0.0)
    variance = 0.0;
  return static_cast<float>(sqrt(variance));
}

// Converts a duration in seconds to microseconds. One half is added before
// the truncating cast, which rounds non-negative values to the nearest
// microsecond.
int64_t Secs2Usecs(float seconds) {
  const double usecs = 1.0e6 * seconds;
  return static_cast<int64_t>(usecs + 0.5);
}

// Converts a linear level to decibels as 20 * log10(value).
//
// Values that are not greater than 1.0e-100 (zero, negatives, NaN) have no
// usable logarithm and map to the sentinel -2000 instead.
float GetDecibel(float value) {
  const bool has_log = (value > 1.0e-100);
  if (!has_log)
    return -2000.0;
  return 20 * log10(value);
}

}  // namespace

namespace mozilla {

// Stores threshold-crossing histories for making decisions about the speech
// state.
//
// The history is a fixed-size circular buffer of (timestamp, decision)
// pairs: once every slot has been written, each new entry overwrites the
// oldest one.
class EnergyEndpointer::HistoryRing {
 public:
  // Creates a ring with no entries; SetRing() gives it its slots.
  HistoryRing() : insertion_index_(0) {}

  // Resets the ring to |size| elements each with state |initial_state|
  // and a placeholder timestamp of -1. Any previous contents are discarded.
  void SetRing(int size, bool initial_state);

  // Inserts a new entry into the ring and drops the oldest entry.
  void Insert(int64_t time_us, bool decision);

  // Returns the time in microseconds of the most recently added entry.
  // Before the first Insert() after SetRing() this is the -1 placeholder.
  int64_t EndTime() const;

  // Returns the sum of all intervals during which 'decision' is true within
  // the time in seconds specified by 'duration_sec', measured back from the
  // most recent entry. The returned interval is in seconds.
  float RingSum(float duration_sec);

 private:
  // One history sample: whether the frame stamped |time_us| (microseconds)
  // was judged to be above the decision threshold.
  struct DecisionPoint {
    int64_t time_us;
    bool decision;
  };

  // Ring storage; its size is fixed by SetRing().
  std::vector<DecisionPoint> decision_points_;
  int insertion_index_;  // Index at which the next item gets added/inserted.

  // Copying is disallowed: these are declared private and no definition
  // appears in this file.
  HistoryRing(const HistoryRing&);
  void operator=(const HistoryRing&);
};

void EnergyEndpointer::HistoryRing::SetRing(int size, bool initial_state) {
  insertion_index_ = 0;
  decision_points_.clear();
  DecisionPoint init = { -1, initial_state };
  decision_points_.resize(size, init);
}

void EnergyEndpointer::HistoryRing::Insert(int64_t time_us, bool decision) {
  decision_points_[insertion_index_].time_us = time_us;
  decision_points_[insertion_index_].decision = decision;
  insertion_index_ = (insertion_index_ + 1) % decision_points_.size();
}

// Returns the timestamp, in microseconds, stored in the slot just before
// insertion_index_ (wrapping to the last slot), i.e. the most recently
// inserted entry.
//
// Returns -1 when the ring has no slots, matching the placeholder timestamp
// that SetRing() gives to slots that have not been written yet. Without this
// guard the index below would be -1 and the read would be out of bounds.
int64_t EnergyEndpointer::HistoryRing::EndTime() const {
  if (decision_points_.empty())
    return -1;

  int ind = insertion_index_ - 1;
  if (ind < 0)
    ind = decision_points_.size() - 1;
  return decision_points_[ind].time_us;
}

float EnergyEndpointer::HistoryRing::RingSum(float duration_sec) {
  if (decision_points_.empty())
    return 0.0;

  int64_t sum_us = 0;
  int ind = insertion_index_ - 1;
  if (ind < 0)
    ind = decision_points_.size() - 1;
  int64_t end_us = decision_points_[ind].time_us;
  bool is_on = decision_points_[ind].decision;
  int64_t start_us = end_us - static_cast<int64_t>(0.5 + (1.0e6 * duration_sec));
  if (start_us < 0)
    start_us = 0;
  size_t n_summed = 1;  // n points ==> (n-1) intervals
  while ((decision_points_[ind].time_us > start_us) &&
         (n_summed < decision_points_.size())) {
    --ind;
    if (ind < 0)
      ind = decision_points_.size() - 1;
    if (is_on)
      sum_us += end_us - decision_points_[ind].time_us;
    is_on = decision_points_[ind].decision;
    end_us = decision_points_[ind].time_us;
    n_summed++;
  }

  return 1.0e-6f * sum_us;  //  Returns total time that was super threshold.
}

// Builds an endpointer in the EP_PRE_SPEECH state with all levels, counters
// and timestamps zeroed. params_ is not touched here; Init() installs the
// parameters and derives the remaining settings from them.
EnergyEndpointer::EnergyEndpointer()
    : status_(EP_PRE_SPEECH),
      offset_confirm_dur_sec_(0),
      endpointer_time_us_(0),
      fast_update_frames_(0),
      frame_counter_(0),
      // Starting value only; Init() recomputes it from the window parameters.
      max_window_dur_(4.0),
      sample_rate_(0),
      // The ring starts with no slots; Restart() sizes it via SetRing().
      history_(new HistoryRing()),
      decision_threshold_(0),
      estimating_environment_(false),
      noise_level_(0),
      rms_adapt_(0),
      start_lag_(0),
      end_lag_(0),
      user_input_start_time_us_(0) {
}

// Nothing is released explicitly here.
// NOTE(review): history_ is allocated with new in the constructor; presumably
// its declared type in the header is an owning pointer that frees it --
// confirm.
EnergyEndpointer::~EnergyEndpointer() {
}

// Converts |time| into a frame count by dividing it by
// params_.frame_period() (so |time| is expected in the same unit as the
// frame period). One half is added before the truncating cast, which rounds
// non-negative results to the nearest whole frame.
int EnergyEndpointer::TimeToFrame(float time) const {
  return static_cast<int32_t>((time / params_.frame_period()) + 0.5);
}

// Returns the endpointer to the EP_PRE_SPEECH state with a cleared history.
//
//   reset_threshold - when true, the decision threshold, the adapted RMS
//                     level, the noise level and the frame counter are also
//                     reset to their initial values from params_.
void EnergyEndpointer::Restart(bool reset_threshold) {
  user_input_start_time_us_ = 0;
  status_ = EP_PRE_SPEECH;

  if (reset_threshold) {
    frame_counter_ = 0;  // Re-enables the rapid initial update of levels.
    noise_level_ = params_.decision_threshold() / 2.0f;
    decision_threshold_ = params_.decision_threshold();
    rms_adapt_ = decision_threshold_;
  }

  // Rebuild the history ring: enough slots for the longest window, every
  // decision initially false.
  history_->SetRing(TimeToFrame(max_window_dur_), false);

  // When true, incoming audio is used only to estimate the environment
  // because the user has not yet started input (e.g. by pressing the
  // push-to-talk button). It defaults to false for backward compatibility.
  estimating_environment_ = false;
}

// Installs |params| and derives every setting that depends on them: the
// history length, the offset confirmation time, the length of the fast
// level-update phase, and the sample rate with its two lag values. It also
// restarts the endpointer with freshly reset thresholds.
void EnergyEndpointer::Init(const EnergyEndpointerParams& params) {
  params_ = params;

  // The ring must cover the longest of the three history windows. NOTE:
  // the resulting number of frames depends on the frame period in |params|
  // being set correctly by whoever created them.
  max_window_dur_ = params_.onset_window();
  if (max_window_dur_ < params_.speech_on_window())
    max_window_dur_ = params_.speech_on_window();
  if (max_window_dur_ < params_.offset_window())
    max_window_dur_ = params_.offset_window();
  Restart(true);

  // The next four assignments repeat values that Restart(true) has just
  // applied.
  frame_counter_ = 0;  // Used for rapid initial update of levels.
  user_input_start_time_us_ = 0;
  // The user has not started input yet, but by default audio is not treated
  // as environment-estimation audio (backward compatibility).
  estimating_environment_ = false;
  // The initial noise level is inconsequential: the level of the first
  // frame overwrites it.
  noise_level_ = params_.decision_threshold() / 2.0f;

  // Never let the offset confirmation time go negative.
  offset_confirm_dur_sec_ = params_.offset_window() -
                            params_.offset_confirm_dur();
  if (offset_confirm_dur_sec_ < 0.0)
    offset_confirm_dur_sec_ = 0.0;

  // Number of frames during which levels are updated rapidly.
  fast_update_frames_ =
      static_cast<int64_t>(params_.fast_update_dur() / params_.frame_period());

  // Lags, in samples, of one period at the highest and the lowest
  // fundamental frequency.
  sample_rate_ = params_.sample_rate();
  end_lag_ = static_cast<int>(sample_rate_ /
                              params_.min_fundamental_frequency());
  start_lag_ = static_cast<int>(sample_rate_ /
                                params_.max_fundamental_frequency());
}

// Begins a new session: restarts the endpointer in EP_PRE_SPEECH with an
// empty history and with the thresholds and level estimates reset.
void EnergyEndpointer::StartSession() {
  Restart(true);
}

// Ends the session by moving the state to EP_POST_SPEECH. The state switch
// in ProcessAudioFrame() has no case for EP_POST_SPEECH, so later frames
// leave this state unchanged until the endpointer is restarted.
void EnergyEndpointer::EndSession() {
  status_ = EP_POST_SPEECH;
}

// Restarts the endpointer with reset thresholds and then switches it to
// environment-estimation mode, in which ProcessAudioFrame() only updates the
// level estimates and makes no speech decisions.
void EnergyEndpointer::SetEnvironmentEstimationMode() {
  Restart(true);
  // Must come after Restart(), which clears this flag.
  estimating_environment_ = true;
}

// Switches from environment estimation to processing user input. The
// timestamp of the last processed frame is remembered as the start of user
// input; ProcessAudioFrame() measures the contamination rejection period
// from it.
void EnergyEndpointer::SetUserInputMode() {
  user_input_start_time_us_ = endpointer_time_us_;
  estimating_environment_ = false;
}

// Consumes one frame of audio: records a speech/non-speech decision for it,
// advances the endpointer state machine, adapts the decision threshold and
// updates the level estimates.
//
//   time_us      - timestamp of the frame, in microseconds.
//   samples      - the |num_samples| 16-bit samples of the frame.
//   num_samples  - number of entries in |samples|.
//   rms_out      - optional; when non-null it receives the level of the
//                  frame converted to decibels.
void EnergyEndpointer::ProcessAudioFrame(int64_t time_us,
                                         const int16_t* samples,
                                         int num_samples,
                                         float* rms_out) {
  const float rms = RMS(samples, num_samples);
  endpointer_time_us_ = time_us;

  // Distinguish user input audio from pre-input adaptation audio. Input
  // starts when the user indicates it, e.g. by pressing push-to-talk; audio
  // received before that only updates the noise and speech level estimates
  // at the bottom of this function.
  if (!estimating_environment_) {
    // Frames inside the contamination rejection period that follows the
    // start of user input are always recorded as non-speech.
    const bool within_rejection_period =
        (endpointer_time_us_ - user_input_start_time_us_) <
        Secs2Usecs(params_.contamination_rejection_period());
    const bool decision =
        !within_rejection_period && (rms > decision_threshold_);

    history_->Insert(endpointer_time_us_, decision);

    switch (status_) {
      case EP_PRE_SPEECH: {
        const float onset_sum = history_->RingSum(params_.onset_window());
        if (onset_sum > params_.onset_detect_dur())
          status_ = EP_POSSIBLE_ONSET;
        break;
      }

      case EP_POSSIBLE_ONSET: {
        const float onset_sum = history_->RingSum(params_.onset_window());
        if (onset_sum > params_.onset_confirm_dur()) {
          status_ = EP_SPEECH_PRESENT;
        } else if (onset_sum <= params_.onset_detect_dur()) {
          // The signal was not maintained: drop back to pre-speech.
          status_ = EP_PRE_SPEECH;
        }
        break;
      }

      case EP_SPEECH_PRESENT: {
        // Hysteresis: staying in SPEECH_PRESENT requires less "on" time in
        // the ring than entering it did.
        const float speech_sum =
            history_->RingSum(params_.speech_on_window());
        if (speech_sum < params_.on_maintain_dur())
          status_ = EP_POSSIBLE_OFFSET;
        break;
      }

      case EP_POSSIBLE_OFFSET: {
        const float offset_sum = history_->RingSum(params_.offset_window());
        if (offset_sum <= offset_confirm_dur_sec_) {
          // Offset confirmed. In a real-time system this offset time may be
          // beyond the end of the input buffer; the RecognizerSession
          // decides what to do about that. Reset for the next utterance.
          status_ = EP_PRE_SPEECH;
        } else if (history_->RingSum(params_.speech_on_window()) >=
                   params_.on_maintain_dur()) {
          // Speech picked up again: return to SPEECH_PRESENT.
          status_ = EP_SPEECH_PRESENT;
        }
        break;
      }

      default:
        break;
    }

    if (!decision && (status_ == EP_PRE_SPEECH)) {
      // Quiet, non-speech region: slowly pull the threshold towards twice
      // the frame RMS, i.e. about 6dB above the average RMS.
      decision_threshold_ = (0.98f * decision_threshold_) + (0.02f * 2 * rms);
      rms_adapt_ = decision_threshold_;
    } else if (decision && (status_ == EP_SPEECH_PRESENT)) {
      // Speech region: track the speech level, rising 5 times faster than
      // it decays, and steer the threshold towards about 10dB below that
      // level plus the noise level, so a high noise level pushes it up.
      if (rms < rms_adapt_) {
        rms_adapt_ = (0.99f * rms_adapt_) + (0.01f * rms);
      } else {
        rms_adapt_ = (0.95f * rms_adapt_) + (0.05f * rms);
      }
      const float target_threshold = 0.3f * rms_adapt_ + noise_level_;
      decision_threshold_ = (.90f * decision_threshold_) +
                            (0.10f * target_threshold);
    }

    // Never let the threshold fall below the configured floor.
    if (decision_threshold_ < params_.min_decision_threshold())
      decision_threshold_ = params_.min_decision_threshold();
  }

  // Update speech and noise levels.
  UpdateLevels(rms);
  ++frame_counter_;

  if (rms_out)
    *rms_out = GetDecibel(rms);
}

// Returns the current noise level estimate converted to decibels by
// GetDecibel(), which yields -2000 when the estimate is not positive.
float EnergyEndpointer::GetNoiseLevelDb() const {
  return GetDecibel(noise_level_);
}

// Folds the level |rms| of the latest frame into the noise level estimate
// and, during the fast-update phase or while estimating the environment,
// derives the decision threshold from that estimate.
void EnergyEndpointer::UpdateLevels(float rms) {
  const bool fast_phase = frame_counter_ < fast_update_frames_;

  if (fast_phase) {
    // Update quickly at first, assuming the input is noise. Alpha increases
    // from 0 to (k-1)/k, where k is the number of time steps in the initial
    // adaptation period, so the first frame replaces the estimate outright.
    float alpha = static_cast<float>(frame_counter_) /
        static_cast<float>(fast_update_frames_);
    noise_level_ = (alpha * noise_level_) + ((1 - alpha) * rms);
  } else if (noise_level_ < rms) {
    // The noise level adapts slowly upward...
    noise_level_ = (0.999f * noise_level_) + (0.001f * rms);
  } else {
    // ...but quickly downward.
    noise_level_ = (0.95f * noise_level_) + (0.05f * rms);
  }

  if (!(estimating_environment_ || fast_phase))
    return;

  // Assume speech is 6dB above the noise level, subject to the floor.
  decision_threshold_ = noise_level_ * 2;
  if (decision_threshold_ < params_.min_decision_threshold())
    decision_threshold_ = params_.min_decision_threshold();
}

// Returns the current endpointer state.
//
//   status_time - must not be null, as it is written unconditionally;
//                 receives the timestamp, in microseconds, of the most
//                 recent entry in the decision history.
EpStatus EnergyEndpointer::Status(int64_t* status_time)  const {
  *status_time = history_->EndTime();
  return status_;
}

}  // namespace mozilla

Coverage Report

Created: 2018-09-25 14:53

Line	Count	Source (jump to first uncovered line)
1		// Copyright (c) 2013 The Chromium Authors. All rights reserved.
2		//
3		// Redistribution and use in source and binary forms, with or without
4		// modification, are permitted provided that the following conditions are
5		// met:
6		//
7		// * Redistributions of source code must retain the above copyright
8		// notice, this list of conditions and the following disclaimer.
9		// * Redistributions in binary form must reproduce the above
10		// copyright notice, this list of conditions and the following disclaimer
11		// in the documentation and/or other materials provided with the
12		// distribution.
13		// * Neither the name of Google Inc. nor the names of its
14		// contributors may be used to endorse or promote products derived from
15		// this software without specific prior written permission.
16		//
17		// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18		// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19		// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20		// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21		// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22		// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23		// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24		// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25		// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26		// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27		// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28
29		#include "energy_endpointer.h"
30
31		#include <math.h>
32
33		namespace {
34
35		// Returns the RMS (quadratic mean) of the input signal.
36	0	float RMS(const int16_t* samples, int num_samples) {
37	0	int64_t ssq_int64_t = 0;
38	0	int64_t sum_int64_t = 0;
39	0	for (int i = 0; i < num_samples; ++i) {
40	0	sum_int64_t += samples[i];
41	0	ssq_int64_t += samples[i] * samples[i];
42	0	}
43	0	// now convert to floats.
44	0	double sum = static_cast<double>(sum_int64_t);
45	0	sum /= num_samples;
46	0	double ssq = static_cast<double>(ssq_int64_t);
47	0	return static_cast<float>(sqrt((ssq / num_samples) - (sum * sum)));
48	0	}
49
50	0	int64_t Secs2Usecs(float seconds) {
51	0	return static_cast<int64_t>(0.5 + (1.0e6 * seconds));
52	0	}
53
54	0	float GetDecibel(float value) {
55	0	if (value > 1.0e-100)
56	0	return 20 * log10(value);
57	0	return -2000.0;
58	0	}
59
60		} // namespace
61
62		namespace mozilla {
63
64		// Stores threshold-crossing histories for making decisions about the speech
65		// state.
66		class EnergyEndpointer::HistoryRing {
67		public:
68	0	HistoryRing() : insertion_index_(0) {}
69
70		// Resets the ring to \|size\| elements each with state \|initial_state\|
71		void SetRing(int size, bool initial_state);
72
73		// Inserts a new entry into the ring and drops the oldest entry.
74		void Insert(int64_t time_us, bool decision);
75
76		// Returns the time in microseconds of the most recently added entry.
77		int64_t EndTime() const;
78
79		// Returns the sum of all intervals during which 'decision' is true within
80		// the time in seconds specified by 'duration'. The returned interval is
81		// in seconds.
82		float RingSum(float duration_sec);
83
84		private:
85		struct DecisionPoint {
86		int64_t time_us;
87		bool decision;
88		};
89
90		std::vector<DecisionPoint> decision_points_;
91		int insertion_index_; // Index at which the next item gets added/inserted.
92
93		HistoryRing(const HistoryRing&);
94		void operator=(const HistoryRing&);
95		};
96
97	0	void EnergyEndpointer::HistoryRing::SetRing(int size, bool initial_state) {
98	0	insertion_index_ = 0;
99	0	decision_points_.clear();
100	0	DecisionPoint init = { -1, initial_state };
101	0	decision_points_.resize(size, init);
102	0	}
103
104	0	void EnergyEndpointer::HistoryRing::Insert(int64_t time_us, bool decision) {
105	0	decision_points_[insertion_index_].time_us = time_us;
106	0	decision_points_[insertion_index_].decision = decision;
107	0	insertion_index_ = (insertion_index_ + 1) % decision_points_.size();
108	0	}
109
110	0	int64_t EnergyEndpointer::HistoryRing::EndTime() const {
111	0	int ind = insertion_index_ - 1;
112	0	if (ind < 0)
113	0	ind = decision_points_.size() - 1;
114	0	return decision_points_[ind].time_us;
115	0	}
116
117	0	float EnergyEndpointer::HistoryRing::RingSum(float duration_sec) {
118	0	if (decision_points_.empty())
119	0	return 0.0;
120	0
121	0	int64_t sum_us = 0;
122	0	int ind = insertion_index_ - 1;
123	0	if (ind < 0)
124	0	ind = decision_points_.size() - 1;
125	0	int64_t end_us = decision_points_[ind].time_us;
126	0	bool is_on = decision_points_[ind].decision;
127	0	int64_t start_us = end_us - static_cast<int64_t>(0.5 + (1.0e6 * duration_sec));
128	0	if (start_us < 0)
129	0	start_us = 0;
130	0	size_t n_summed = 1; // n points ==> (n-1) intervals
131	0	while ((decision_points_[ind].time_us > start_us) &&
132	0	(n_summed < decision_points_.size())) {
133	0	--ind;
134	0	if (ind < 0)
135	0	ind = decision_points_.size() - 1;
136	0	if (is_on)
137	0	sum_us += end_us - decision_points_[ind].time_us;
138	0	is_on = decision_points_[ind].decision;
139	0	end_us = decision_points_[ind].time_us;
140	0	n_summed++;
141	0	}
142	0
143	0	return 1.0e-6f * sum_us; // Returns total time that was super threshold.
144	0	}
145
146		EnergyEndpointer::EnergyEndpointer()
147		: status_(EP_PRE_SPEECH),
148		offset_confirm_dur_sec_(0),
149		endpointer_time_us_(0),
150		fast_update_frames_(0),
151		frame_counter_(0),
152		max_window_dur_(4.0),
153		sample_rate_(0),
154		history_(new HistoryRing()),
155		decision_threshold_(0),
156		estimating_environment_(false),
157		noise_level_(0),
158		rms_adapt_(0),
159		start_lag_(0),
160		end_lag_(0),
161	0	user_input_start_time_us_(0) {
162	0	}
163
164	0	EnergyEndpointer::~EnergyEndpointer() {
165	0	}
166
167	0	int EnergyEndpointer::TimeToFrame(float time) const {
168	0	return static_cast<int32_t>(0.5 + (time / params_.frame_period()));
169	0	}
170
171	0	void EnergyEndpointer::Restart(bool reset_threshold) {
172	0	status_ = EP_PRE_SPEECH;
173	0	user_input_start_time_us_ = 0;
174	0
175	0	if (reset_threshold) {
176	0	decision_threshold_ = params_.decision_threshold();
177	0	rms_adapt_ = decision_threshold_;
178	0	noise_level_ = params_.decision_threshold() / 2.0f;
179	0	frame_counter_ = 0; // Used for rapid initial update of levels.
180	0	}
181	0
182	0	// Set up the memories to hold the history windows.
183	0	history_->SetRing(TimeToFrame(max_window_dur_), false);
184	0
185	0	// Flag that indicates that current input should be used for
186	0	// estimating the environment. The user has not yet started input
187	0	// by e.g. pressed the push-to-talk button. By default, this is
188	0	// false for backward compatibility.
189	0	estimating_environment_ = false;
190	0	}
191
192	0	void EnergyEndpointer::Init(const EnergyEndpointerParams& params) {
193	0	params_ = params;
194	0
195	0	// Find the longest history interval to be used, and make the ring
196	0	// large enough to accommodate that number of frames. NOTE: This
197	0	// depends upon ep_frame_period being set correctly in the factory
198	0	// that did this instantiation.
199	0	max_window_dur_ = params_.onset_window();
200	0	if (params_.speech_on_window() > max_window_dur_)
201	0	max_window_dur_ = params_.speech_on_window();
202	0	if (params_.offset_window() > max_window_dur_)
203	0	max_window_dur_ = params_.offset_window();
204	0	Restart(true);
205	0
206	0	offset_confirm_dur_sec_ = params_.offset_window() -
207	0	params_.offset_confirm_dur();
208	0	if (offset_confirm_dur_sec_ < 0.0)
209	0	offset_confirm_dur_sec_ = 0.0;
210	0
211	0	user_input_start_time_us_ = 0;
212	0
213	0	// Flag that indicates that current input should be used for
214	0	// estimating the environment. The user has not yet started input
215	0	// by e.g. pressed the push-to-talk button. By default, this is
216	0	// false for backward compatibility.
217	0	estimating_environment_ = false;
218	0	// The initial value of the noise and speech levels is inconsequential.
219	0	// The level of the first frame will overwrite these values.
220	0	noise_level_ = params_.decision_threshold() / 2.0f;
221	0	fast_update_frames_ =
222	0	static_cast<int64_t>(params_.fast_update_dur() / params_.frame_period());
223	0
224	0	frame_counter_ = 0; // Used for rapid initial update of levels.
225	0
226	0	sample_rate_ = params_.sample_rate();
227	0	start_lag_ = static_cast<int>(sample_rate_ /
228	0	params_.max_fundamental_frequency());
229	0	end_lag_ = static_cast<int>(sample_rate_ /
230	0	params_.min_fundamental_frequency());
231	0	}
232
233	0	void EnergyEndpointer::StartSession() {
234	0	Restart(true);
235	0	}
236
237	0	void EnergyEndpointer::EndSession() {
238	0	status_ = EP_POST_SPEECH;
239	0	}
240
241	0	void EnergyEndpointer::SetEnvironmentEstimationMode() {
242	0	Restart(true);
243	0	estimating_environment_ = true;
244	0	}
245
246	0	void EnergyEndpointer::SetUserInputMode() {
247	0	estimating_environment_ = false;
248	0	user_input_start_time_us_ = endpointer_time_us_;
249	0	}
250
251		void EnergyEndpointer::ProcessAudioFrame(int64_t time_us,
252		const int16_t* samples,
253		int num_samples,
254	0	float* rms_out) {
255	0	endpointer_time_us_ = time_us;
256	0	float rms = RMS(samples, num_samples);
257	0
258	0	// Check that this is user input audio vs. pre-input adaptation audio.
259	0	// Input audio starts when the user indicates start of input, by e.g.
260	0	// pressing push-to-talk. Audio recieved prior to that is used to update
261	0	// noise and speech level estimates.
262	0	if (!estimating_environment_) {
263	0	bool decision = false;
264	0	if ((endpointer_time_us_ - user_input_start_time_us_) <
265	0	Secs2Usecs(params_.contamination_rejection_period())) {
266	0	decision = false;
267	0	//PR_LOG(GetSpeechRecognitionLog(), PR_LOG_DEBUG, ("decision: forced to false, time: %d", endpointer_time_us_));
268	0	} else {
269	0	decision = (rms > decision_threshold_);
270	0	}
271	0
272	0	history_->Insert(endpointer_time_us_, decision);
273	0
274	0	switch (status_) {
275	0	case EP_PRE_SPEECH:
276	0	if (history_->RingSum(params_.onset_window()) >
277	0	params_.onset_detect_dur()) {
278	0	status_ = EP_POSSIBLE_ONSET;
279	0	}
280	0	break;
281	0
282	0	case EP_POSSIBLE_ONSET: {
283	0	float tsum = history_->RingSum(params_.onset_window());
284	0	if (tsum > params_.onset_confirm_dur()) {
285	0	status_ = EP_SPEECH_PRESENT;
286	0	} else { // If signal is not maintained, drop back to pre-speech.
287	0	if (tsum <= params_.onset_detect_dur())
288	0	status_ = EP_PRE_SPEECH;
289	0	}
290	0	break;
291	0	}
292	0
293	0	case EP_SPEECH_PRESENT: {
294	0	// To induce hysteresis in the state residency, we allow a
295	0	// smaller residency time in the on_ring, than was required to
296	0	// enter the SPEECH_PERSENT state.
297	0	float on_time = history_->RingSum(params_.speech_on_window());
298	0	if (on_time < params_.on_maintain_dur())
299	0	status_ = EP_POSSIBLE_OFFSET;
300	0	break;
301	0	}
302	0
303	0	case EP_POSSIBLE_OFFSET:
304	0	if (history_->RingSum(params_.offset_window()) <=
305	0	offset_confirm_dur_sec_) {
306	0	// Note that this offset time may be beyond the end
307	0	// of the input buffer in a real-time system. It will be up
308	0	// to the RecognizerSession to decide what to do.
309	0	status_ = EP_PRE_SPEECH; // Automatically reset for next utterance.
310	0	} else { // If speech picks up again we allow return to SPEECH_PRESENT.
311	0	if (history_->RingSum(params_.speech_on_window()) >=
312	0	params_.on_maintain_dur())
313	0	status_ = EP_SPEECH_PRESENT;
314	0	}
315	0	break;
316	0
317	0	default:
318	0	break;
319	0	}
320	0
321	0	// If this is a quiet, non-speech region, slowly adapt the detection
322	0	// threshold to be about 6dB above the average RMS.
323	0	if ((!decision) && (status_ == EP_PRE_SPEECH)) {
324	0	decision_threshold_ = (0.98f * decision_threshold_) + (0.02f * 2 * rms);
325	0	rms_adapt_ = decision_threshold_;
326	0	} else {
327	0	// If this is in a speech region, adapt the decision threshold to
328	0	// be about 10dB below the average RMS. If the noise level is high,
329	0	// the threshold is pushed up.
330	0	// Adaptation up to a higher level is 5 times faster than decay to
331	0	// a lower level.
332	0	if ((status_ == EP_SPEECH_PRESENT) && decision) {
333	0	if (rms_adapt_ > rms) {
334	0	rms_adapt_ = (0.99f * rms_adapt_) + (0.01f * rms);
335	0	} else {
336	0	rms_adapt_ = (0.95f * rms_adapt_) + (0.05f * rms);
337	0	}
338	0	float target_threshold = 0.3f * rms_adapt_ + noise_level_;
339	0	decision_threshold_ = (.90f * decision_threshold_) +
340	0	(0.10f * target_threshold);
341	0	}
342	0	}
343	0
344	0	// Set a floor
345	0	if (decision_threshold_ < params_.min_decision_threshold())
346	0	decision_threshold_ = params_.min_decision_threshold();
347	0	}
348	0
349	0	// Update speech and noise levels.
350	0	UpdateLevels(rms);
351	0	++frame_counter_;
352	0
353	0	if (rms_out)
354	0	*rms_out = GetDecibel(rms);
355	0	}
356
357	0	float EnergyEndpointer::GetNoiseLevelDb() const {
358	0	return GetDecibel(noise_level_);
359	0	}
360
361	0	void EnergyEndpointer::UpdateLevels(float rms) {
362	0	// Update quickly initially. We assume this is noise and that
363	0	// speech is 6dB above the noise.
364	0	if (frame_counter_ < fast_update_frames_) {
365	0	// Alpha increases from 0 to (k-1)/k where k is the number of time
366	0	// steps in the initial adaptation period.
367	0	float alpha = static_cast<float>(frame_counter_) /
368	0	static_cast<float>(fast_update_frames_);
369	0	noise_level_ = (alpha * noise_level_) + ((1 - alpha) * rms);
370	0	//PR_LOG(GetSpeechRecognitionLog(), PR_LOG_DEBUG, ("FAST UPDATE, frame_counter_ %d, fast_update_frames_ %d", frame_counter_, fast_update_frames_));
371	0	} else {
372	0	// Update Noise level. The noise level adapts quickly downward, but
373	0	// slowly upward. The noise_level_ parameter is not currently used
374	0	// for threshold adaptation. It is used for UI feedback.
375	0	if (noise_level_ < rms)
376	0	noise_level_ = (0.999f * noise_level_) + (0.001f * rms);
377	0	else
378	0	noise_level_ = (0.95f * noise_level_) + (0.05f * rms);
379	0	}
380	0	if (estimating_environment_ \|\| (frame_counter_ < fast_update_frames_)) {
381	0	decision_threshold_ = noise_level_ * 2; // 6dB above noise level.
382	0	// Set a floor
383	0	if (decision_threshold_ < params_.min_decision_threshold())
384	0	decision_threshold_ = params_.min_decision_threshold();
385	0	}
386	0	}
387
388	0	EpStatus EnergyEndpointer::Status(int64_t* status_time) const {
389	0	*status_time = history_->EndTime();
390	0	return status_;
391	0	}
392
393		} // namespace mozilla