/src/mozilla-central/dom/media/webspeech/recognition/SpeechRecognition.h
/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* vim:set ts=2 sw=2 sts=2 et cindent: */
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#ifndef mozilla_dom_SpeechRecognition_h
#define mozilla_dom_SpeechRecognition_h

#include "mozilla/Attributes.h"
#include "mozilla/DOMEventTargetHelper.h"
#include "nsCOMPtr.h"
#include "nsString.h"
#include "nsWrapperCache.h"
#include "nsTArray.h"
#include "js/TypeDecls.h"

#include "DOMMediaStream.h"
#include "nsIDOMNavigatorUserMedia.h"
#include "nsITimer.h"
#include "MediaStreamGraph.h"
#include "AudioSegment.h"
#include "mozilla/WeakPtr.h"

#include "SpeechGrammarList.h"
#include "SpeechRecognitionResultList.h"
#include "SpeechStreamListener.h"
#include "nsISpeechRecognitionService.h"
#include "endpointer.h"

#include "mozilla/dom/BindingDeclarations.h"
#include "mozilla/dom/SpeechRecognitionError.h"

namespace mozilla {

class DOMMediaStream;

namespace dom {

#define SPEECH_RECOGNITION_TEST_EVENT_REQUEST_TOPIC \
  "SpeechRecognitionTest:RequestEvent"
#define SPEECH_RECOGNITION_TEST_END_TOPIC "SpeechRecognitionTest:End"

class GlobalObject;
class SpeechEvent;

LogModule* GetSpeechRecognitionLog();
#define SR_LOG(...) \
  MOZ_LOG(GetSpeechRecognitionLog(), mozilla::LogLevel::Debug, (__VA_ARGS__))

class SpeechRecognition final : public DOMEventTargetHelper,
                                public nsIObserver,
                                public SupportsWeakPtr<SpeechRecognition>
{
public:
  MOZ_DECLARE_WEAKREFERENCE_TYPENAME(SpeechRecognition)
  explicit SpeechRecognition(nsPIDOMWindowInner* aOwnerWindow);

  NS_DECL_ISUPPORTS_INHERITED
  NS_DECL_CYCLE_COLLECTION_CLASS_INHERITED(SpeechRecognition,
                                           DOMEventTargetHelper)

  NS_DECL_NSIOBSERVER

  nsISupports* GetParentObject() const;

  JSObject* WrapObject(JSContext* aCx,
                       JS::Handle<JSObject*> aGivenProto) override;

  static bool IsAuthorized(JSContext* aCx, JSObject* aGlobal);

  static already_AddRefed<SpeechRecognition>
  Constructor(const GlobalObject& aGlobal, ErrorResult& aRv);

  already_AddRefed<SpeechGrammarList> Grammars() const;

  void SetGrammars(mozilla::dom::SpeechGrammarList& aArg);

  void GetLang(nsString& aRetVal) const;

  void SetLang(const nsAString& aArg);

  bool GetContinuous(ErrorResult& aRv) const;

  void SetContinuous(bool aArg, ErrorResult& aRv);

  bool InterimResults() const;

  void SetInterimResults(bool aArg);

  uint32_t MaxAlternatives() const;

  void SetMaxAlternatives(uint32_t aArg);

  void GetServiceURI(nsString& aRetVal, ErrorResult& aRv) const;

  void SetServiceURI(const nsAString& aArg, ErrorResult& aRv);

  void Start(const Optional<NonNull<DOMMediaStream>>& aStream,
             CallerType aCallerType, ErrorResult& aRv);

  void Stop();

  void Abort();

  IMPL_EVENT_HANDLER(audiostart)
  IMPL_EVENT_HANDLER(soundstart)
  IMPL_EVENT_HANDLER(speechstart)
  IMPL_EVENT_HANDLER(speechend)
  IMPL_EVENT_HANDLER(soundend)
  IMPL_EVENT_HANDLER(audioend)
  IMPL_EVENT_HANDLER(result)
  IMPL_EVENT_HANDLER(nomatch)
  IMPL_EVENT_HANDLER(error)
  IMPL_EVENT_HANDLER(start)
  IMPL_EVENT_HANDLER(end)
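
  // Per the Web Speech API spec, a successful session typically fires these
  // events in order: start, audiostart, soundstart, speechstart, speechend,
  // soundend, audioend, result (or nomatch), end. An error event may take the
  // place of result and is likewise followed by end.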

  enum EventType {
    EVENT_START,
    EVENT_STOP,
    EVENT_ABORT,
    EVENT_AUDIO_DATA,
    EVENT_AUDIO_ERROR,
    EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT,
    EVENT_RECOGNITIONSERVICE_FINAL_RESULT,
    EVENT_RECOGNITIONSERVICE_ERROR,
    EVENT_COUNT
  };
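
  // These internal events are carried by SpeechEvent runnables (declared
  // below) and consumed by ProcessEvent()/Transition(), which drive the
  // private FSM.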

  void DispatchError(EventType aErrorType,
                     SpeechRecognitionErrorCode aErrorCode,
                     const nsAString& aMessage);
  uint32_t FillSamplesBuffer(const int16_t* aSamples, uint32_t aSampleCount);
  uint32_t SplitSamplesBuffer(const int16_t* aSamplesBuffer,
                              uint32_t aSampleCount,
                              nsTArray<RefPtr<SharedBuffer>>& aResult);
  AudioSegment* CreateAudioSegment(nsTArray<RefPtr<SharedBuffer>>& aChunks);
  void FeedAudioData(already_AddRefed<SharedBuffer> aSamples,
                     uint32_t aDuration,
                     MediaStreamListener* aProvider, TrackRate aTrackRate);
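
  // Rough sketch (illustrative only, not the normative implementation) of
  // how incoming 16-bit samples are chunked into mAudioSamplesPerChunk-sized
  // SharedBuffers before reaching mEndpointer, assuming FillSamplesBuffer
  // returns the number of samples it consumed:
  //
  //   nsTArray<RefPtr<SharedBuffer>> chunks;
  //   // Top up the partially filled mAudioSamplesBuffer first...
  //   uint32_t consumed = FillSamplesBuffer(aSamples, aSampleCount);
  //   // ...then split whatever remains into whole chunks.
  //   SplitSamplesBuffer(aSamples + consumed, aSampleCount - consumed,
  //                      chunks);
  //   // The chunks become an AudioSegment that an EVENT_AUDIO_DATA event
  //   // carries to ProcessAudioSegment(), which feeds mEndpointer.
  //   AudioSegment* segment = CreateAudioSegment(chunks);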

  friend class SpeechEvent;
private:
  virtual ~SpeechRecognition() {}

  enum FSMState {
    STATE_IDLE,
    STATE_STARTING,
    STATE_ESTIMATING,
    STATE_WAITING_FOR_SPEECH,
    STATE_RECOGNIZING,
    STATE_WAITING_FOR_RESULT,
    STATE_COUNT
  };
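
  // A plausible happy-path walk through these states, inferred from the state
  // names and the event types above (a sketch, not a normative transition
  // table):
  //
  //   STATE_IDLE               -> STATE_STARTING           (EVENT_START)
  //   STATE_STARTING           -> STATE_ESTIMATING         (audio capture
  //                                starts; mEndpointer estimates noise)
  //   STATE_ESTIMATING         -> STATE_WAITING_FOR_SPEECH
  //   STATE_WAITING_FOR_SPEECH -> STATE_RECOGNIZING        (speech detected)
  //   STATE_RECOGNIZING        -> STATE_WAITING_FOR_RESULT (EVENT_STOP or
  //                                speech end detected)
  //   STATE_WAITING_FOR_RESULT -> STATE_IDLE
  //                                (EVENT_RECOGNITIONSERVICE_FINAL_RESULT)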

  void SetState(FSMState state);
  bool StateBetween(FSMState begin, FSMState end);

  bool SetRecognitionService(ErrorResult& aRv);
  bool ValidateAndSetGrammarList(ErrorResult& aRv);

  class GetUserMediaSuccessCallback : public nsIDOMGetUserMediaSuccessCallback
  {
  public:
    NS_DECL_ISUPPORTS
    NS_DECL_NSIDOMGETUSERMEDIASUCCESSCALLBACK

    explicit GetUserMediaSuccessCallback(SpeechRecognition* aRecognition)
      : mRecognition(aRecognition)
    {}

  private:
    virtual ~GetUserMediaSuccessCallback() {}

    RefPtr<SpeechRecognition> mRecognition;
  };

  class GetUserMediaErrorCallback : public nsIDOMGetUserMediaErrorCallback
  {
  public:
    NS_DECL_ISUPPORTS
    NS_DECL_NSIDOMGETUSERMEDIAERRORCALLBACK

    explicit GetUserMediaErrorCallback(SpeechRecognition* aRecognition)
      : mRecognition(aRecognition)
    {}

  private:
    virtual ~GetUserMediaErrorCallback() {}

    RefPtr<SpeechRecognition> mRecognition;
  };

  NS_IMETHOD StartRecording(DOMMediaStream* aDOMStream);
  NS_IMETHOD StopRecording();

  uint32_t ProcessAudioSegment(AudioSegment* aSegment, TrackRate aTrackRate);
  void NotifyError(SpeechEvent* aEvent);

  void ProcessEvent(SpeechEvent* aEvent);
  void Transition(SpeechEvent* aEvent);

  void Reset();
  void ResetAndEnd();
  void WaitForAudioData(SpeechEvent* aEvent);
  void StartedAudioCapture(SpeechEvent* aEvent);
  void StopRecordingAndRecognize(SpeechEvent* aEvent);
  void WaitForEstimation(SpeechEvent* aEvent);
  void DetectSpeech(SpeechEvent* aEvent);
  void WaitForSpeechEnd(SpeechEvent* aEvent);
  void NotifyFinalResult(SpeechEvent* aEvent);
  void DoNothing(SpeechEvent* aEvent);
  void AbortSilently(SpeechEvent* aEvent);
  void AbortError(SpeechEvent* aEvent);

  RefPtr<DOMMediaStream> mDOMStream;
  RefPtr<SpeechStreamListener> mSpeechListener;
  nsCOMPtr<nsISpeechRecognitionService> mRecognitionService;

  FSMState mCurrentState;

  Endpointer mEndpointer;
  uint32_t mEstimationSamples;

  uint32_t mAudioSamplesPerChunk;

  // Buffer that holds one chunk of mAudioSamplesPerChunk samples
  // before it is fed to mEndpointer.
  RefPtr<SharedBuffer> mAudioSamplesBuffer;
  uint32_t mBufferedSamples;

  nsCOMPtr<nsITimer> mSpeechDetectionTimer;
  bool mAborted;

  nsString mLang;

  RefPtr<SpeechGrammarList> mSpeechGrammarList;

  // The Web Speech API spec (http://bit.ly/1gIl7DC) states:
  //
  //   1. The default value MUST be false.
  //   2. If true, interim results SHOULD be returned.
  //   3. If false, interim results MUST NOT be returned.
  //
  // Pocketsphinx does not produce interim results, so defaulting
  // mInterimResults to false and then ignoring any value it is later
  // set to is a conforming implementation.
  bool mInterimResults;

  // The Web Speech API spec (http://bit.ly/1JAiqeo) states:
  //
  //   1. The default value is 1.
  //   2. Subsequent values give the "maximum number of
  //      SpeechRecognitionAlternatives per result".
  //
  // Pocketsphinx can return at most one SpeechRecognitionAlternative per
  // SpeechRecognitionResult. Defaulting mMaxAlternatives to 1, ignoring any
  // nonzero value it is later set to (e.g. 5 still yields one alternative
  // per result), and returning no alternatives when it is set to 0 is
  // therefore a conforming implementation.
  uint32_t mMaxAlternatives;

  void ProcessTestEventRequest(nsISupports* aSubject,
                               const nsAString& aEventName);

  const char* GetName(FSMState aId);
  const char* GetName(SpeechEvent* aId);
};

class SpeechEvent : public Runnable
{
public:
  SpeechEvent(SpeechRecognition* aRecognition,
              SpeechRecognition::EventType aType)
    : Runnable("dom::SpeechEvent")
    , mAudioSegment(nullptr)
    , mRecognitionResultList(nullptr)
    , mError(nullptr)
    , mRecognition(aRecognition)
    , mType(aType)
    , mTrackRate(0)
  {
  }

  ~SpeechEvent();

  NS_IMETHOD Run() override;
  AudioSegment* mAudioSegment;
  // TODO: replace this with a passed-in session object that also carries
  // the result index.
  RefPtr<SpeechRecognitionResultList> mRecognitionResultList;
  RefPtr<SpeechRecognitionError> mError;

  friend class SpeechRecognition;
private:
  SpeechRecognition* mRecognition;

  // For AUDIO_DATA events, keep a reference to the provider of the data
  // (i.e. the SpeechStreamListener) to ensure it is kept alive (and keeps
  // SpeechRecognition alive) until this event gets processed.
  RefPtr<MediaStreamListener> mProvider;
  SpeechRecognition::EventType mType;
  TrackRate mTrackRate;
};

} // namespace dom

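// SpeechRecognition inherits nsISupports along more than one path (via
// DOMEventTargetHelper and nsIObserver), so this overload routes the
// conversion through DOMEventTargetHelper to keep the cast unambiguous.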
inline nsISupports*
ToSupports(dom::SpeechRecognition* aRec)
{
  return ToSupports(static_cast<DOMEventTargetHelper*>(aRec));
}

} // namespace mozilla

#endif // mozilla_dom_SpeechRecognition_h