Coverage Report

Created: 2018-09-25 14:53

/src/mozilla-central/dom/media/webspeech/recognition/endpointer.h
Line
Count
Source (jump to first uncovered line)
1
// Copyright (c) 2013 The Chromium Authors. All rights reserved.
2
//
3
// Redistribution and use in source and binary forms, with or without
4
// modification, are permitted provided that the following conditions are
5
// met:
6
//
7
//    * Redistributions of source code must retain the above copyright
8
// notice, this list of conditions and the following disclaimer.
9
//    * Redistributions in binary form must reproduce the above
10
// copyright notice, this list of conditions and the following disclaimer
11
// in the documentation and/or other materials provided with the
12
// distribution.
13
//    * Neither the name of Google Inc. nor the names of its
14
// contributors may be used to endorse or promote products derived from
15
// this software without specific prior written permission.
16
//
17
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28
29
#ifndef CONTENT_BROWSER_SPEECH_ENDPOINTER_ENDPOINTER_H_
30
#define CONTENT_BROWSER_SPEECH_ENDPOINTER_ENDPOINTER_H_
31
32
#include "energy_endpointer.h"
33
34
namespace mozilla {
35
36
struct AudioChunk;
37
38
// A simple interface to the underlying energy-endpointer implementation, this
39
// class lets callers provide audio as it is being recorded and lets them poll to find
40
// when the user has stopped speaking.
41
//
42
// There are two events that may trigger the end of speech:
43
//
44
// speechInputPossiblyComplete event:
45
//
46
// Signals that silence/noise has been detected for a *short* amount of
47
// time after some speech has been detected. It can be used for low latency
48
// UI feedback. To disable it, set it to a large amount.
49
//
50
// speechInputComplete event:
51
//
52
// This event is intended to signal end of input and to stop recording.
53
// The amount of time to wait after speech is set by
54
// speech_input_complete_silence_length_us_ and optionally two other
55
// parameters (see below).
56
// This time can be held constant, or can change as more speech is detected.
57
// In the latter case, the time changes after a set amount of time from the
58
// *beginning* of speech.  This is motivated by the expectation that there
59
// will be two distinct types of inputs: short search queries and longer
60
// dictation style input.
61
//
62
// Three parameters are used to define the piecewise constant timeout function.
63
// The timeout length is speech_input_complete_silence_length until
64
// long_speech_length, when it changes to
65
// long_speech_input_complete_silence_length.
66
class Endpointer {
67
 public:
68
  explicit Endpointer(int sample_rate);
69
70
  // Start the endpointer. This should be called at the beginning of a session.
71
  void StartSession();
72
73
  // Stop the endpointer.
74
  void EndSession();
75
76
  // Start environment estimation. Incoming audio will be used for environment
77
  // estimation, i.e. noise level estimation.
78
  void SetEnvironmentEstimationMode();
79
80
  // Start user input. This should be called when the user indicates start of
81
  // input, e.g. by pressing a button.
82
  void SetUserInputMode();
83
84
  // Process a segment of audio, which may be more than one frame.
85
  // The status of the last frame will be returned. NOTE(review): rms_out looks like an out-parameter -- confirm in the .cpp.
86
  EpStatus ProcessAudio(const AudioChunk& raw_audio, float* rms_out);
87
88
  // Get the status of the endpointer. NOTE(review): time_us looks like an out-parameter (microseconds, per its name) -- confirm.
89
  EpStatus Status(int64_t *time_us);
90
91
  // Get the expected frame size for audio chunks. Audio chunks are expected
92
  // to contain a number of samples that is a multiple of this number, and extra
93
  // samples will be dropped.
94
0
  int32_t FrameSize() const {
95
0
    return frame_size_;
96
0
  }
97
98
  // Returns true if the endpointer detected reasonable audio levels above
99
  // background noise which could be user speech, false if not.
100
0
  bool DidStartReceivingSpeech() const {
101
0
    return speech_previously_detected_;
102
0
  }
103
104
0
  bool IsEstimatingEnvironment() const {
105
0
    return energy_endpointer_.estimating_environment();
106
0
  }
107
108
0
  void set_speech_input_complete_silence_length(int64_t time_us) {
109
0
    speech_input_complete_silence_length_us_ = time_us;
110
0
  }
111
112
0
  void set_long_speech_input_complete_silence_length(int64_t time_us) {
113
0
    long_speech_input_complete_silence_length_us_ = time_us;
114
0
  }
115
116
0
  void set_speech_input_possibly_complete_silence_length(int64_t time_us) {
117
0
    speech_input_possibly_complete_silence_length_us_ = time_us;
118
0
  }
119
120
0
  void set_long_speech_length(int64_t time_us) {
121
0
    long_speech_length_us_ = time_us;
122
0
  }
123
124
0
  bool speech_input_complete() const {
125
0
    return speech_input_complete_;
126
0
  }
127
128
  // RMS background noise level in dB, as reported by energy_endpointer_.
129
0
  float NoiseLevelDb() const { return energy_endpointer_.GetNoiseLevelDb(); }
130
131
 private:
132
  // Reset internal states. Helper method common to the initial input utterance
133
  // and following input utterances.
134
  void Reset();
135
136
  // Minimum allowable length of speech input.
137
  int64_t speech_input_minimum_length_us_;
138
139
  // The speechInputPossiblyComplete event signals that silence/noise has been
140
  // detected for a *short* amount of time after some speech has been detected.
141
  // This property specifies the time period.
142
  int64_t speech_input_possibly_complete_silence_length_us_;
143
144
  // The speechInputComplete event signals that silence/noise has been
145
  // detected for a *long* amount of time after some speech has been detected.
146
  // This property specifies the time period.
147
  int64_t speech_input_complete_silence_length_us_;
148
149
  // Same as above, this specifies the required silence period after speech
150
  // detection. This period is used instead of
151
  // speech_input_complete_silence_length_us_ when the utterance is longer than
152
  // long_speech_length_us_. This parameter is optional.
153
  int64_t long_speech_input_complete_silence_length_us_;
154
155
  // The period of time after which the endpointer should consider
156
  // long_speech_input_complete_silence_length_us_ as a valid silence period
157
  // instead of speech_input_complete_silence_length_us_. This parameter is
158
  // optional.
159
  int64_t long_speech_length_us_;
160
161
  // First speech onset time, used in determination of speech complete timeout.
162
  int64_t speech_start_time_us_;
163
164
  // Most recent end time, used in determination of speech complete timeout.
165
  int64_t speech_end_time_us_;
166
167
  int64_t audio_frame_time_us_;
168
  EpStatus old_ep_status_;
169
  bool waiting_for_speech_possibly_complete_timeout_;
170
  bool waiting_for_speech_complete_timeout_;
171
  bool speech_previously_detected_;
172
  bool speech_input_complete_;
173
  EnergyEndpointer energy_endpointer_;
174
  int sample_rate_;
175
  int32_t frame_size_;
176
};
177
178
}  // namespace mozilla
179
180
#endif  // CONTENT_BROWSER_SPEECH_ENDPOINTER_ENDPOINTER_H_