/src/mozilla-central/dom/media/webspeech/recognition/SpeechRecognition.cpp
Line | Count | Source |
1 | | /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ |
2 | | /* vim:set ts=2 sw=2 sts=2 et cindent: */ |
3 | | /* This Source Code Form is subject to the terms of the Mozilla Public |
4 | | * License, v. 2.0. If a copy of the MPL was not distributed with this |
5 | | * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ |
6 | | |
7 | | #include "SpeechRecognition.h" |
8 | | |
9 | | #include "nsCOMPtr.h" |
10 | | #include "nsCycleCollectionParticipant.h" |
11 | | |
12 | | #include "mozilla/dom/BindingUtils.h" |
13 | | #include "mozilla/dom/Element.h" |
14 | | #include "mozilla/dom/SpeechRecognitionBinding.h" |
15 | | #include "mozilla/dom/MediaStreamTrackBinding.h" |
16 | | #include "mozilla/dom/MediaStreamError.h" |
17 | | #include "mozilla/MediaManager.h" |
18 | | #include "mozilla/Preferences.h" |
19 | | #include "mozilla/Services.h" |
20 | | #include "mozilla/StaticPrefs.h" |
21 | | |
22 | | #include "AudioSegment.h" |
23 | | #include "DOMMediaStream.h" |
24 | | #include "MediaEnginePrefs.h" |
25 | | #include "endpointer.h" |
26 | | |
27 | | #include "mozilla/dom/SpeechRecognitionEvent.h" |
28 | | #include "nsContentUtils.h" |
29 | | #include "nsIDocument.h" |
30 | | #include "nsIObserverService.h" |
31 | | #include "nsIPermissionManager.h" |
32 | | #include "nsIPrincipal.h" |
33 | | #include "nsPIDOMWindow.h" |
34 | | #include "nsServiceManagerUtils.h" |
35 | | #include "nsQueryObject.h" |
36 | | |
37 | | #include <algorithm> |
38 | | |
39 | | // Undo the windows.h damage |
40 | | #if defined(XP_WIN) && defined(GetMessage) |
41 | | #undef GetMessage |
42 | | #endif |
43 | | |
44 | | namespace mozilla { |
45 | | namespace dom { |
46 | | |
47 | 0 | #define PREFERENCE_DEFAULT_RECOGNITION_SERVICE "media.webspeech.service.default" |
48 | | #define DEFAULT_RECOGNITION_SERVICE_PREFIX "pocketsphinx-" |
49 | 0 | #define DEFAULT_RECOGNITION_SERVICE "pocketsphinx-en-US" |
50 | | |
51 | 0 | #define PREFERENCE_ENDPOINTER_SILENCE_LENGTH "media.webspeech.silence_length" |
52 | 0 | #define PREFERENCE_ENDPOINTER_LONG_SILENCE_LENGTH "media.webspeech.long_silence_length" |
53 | | #define PREFERENCE_ENDPOINTER_LONG_SPEECH_LENGTH "media.webspeech.long_speech_length" |
54 | | |
55 | | static const uint32_t kSAMPLE_RATE = 16000; |
56 | | static const uint32_t kSPEECH_DETECTION_TIMEOUT_MS = 10000; |
57 | | |
58 | | // Number of samples, corresponding to 300 ms of audio, to send to the |
59 | | // endpointer while it is in environment estimation mode. |
60 | | // kSAMPLE_RATE samples = 1 s, so kESTIMATION_SAMPLES = 300 * kSAMPLE_RATE / 1000 = 4800. |
61 | | static const uint32_t kESTIMATION_SAMPLES = 300 * kSAMPLE_RATE / 1000; |
62 | | |
63 | | LogModule* |
64 | | GetSpeechRecognitionLog() |
65 | 0 | { |
66 | 0 | static LazyLogModule sLog("SpeechRecognition"); |
67 | 0 | return sLog; |
68 | 0 | } |
69 | 0 | #define SR_LOG(...) MOZ_LOG(GetSpeechRecognitionLog(), mozilla::LogLevel::Debug, (__VA_ARGS__)) |
70 | | |
71 | | already_AddRefed<nsISpeechRecognitionService> |
72 | | GetSpeechRecognitionService(const nsAString& aLang) |
73 | 0 | { |
74 | 0 | nsAutoCString speechRecognitionServiceCID; |
75 | 0 | |
76 | 0 | nsAutoCString prefValue; |
77 | 0 | Preferences::GetCString(PREFERENCE_DEFAULT_RECOGNITION_SERVICE, prefValue); |
78 | 0 | nsAutoCString speechRecognitionService; |
79 | 0 | |
80 | 0 | if (!aLang.IsEmpty()) { |
81 | 0 | speechRecognitionService = |
82 | 0 | NS_LITERAL_CSTRING(DEFAULT_RECOGNITION_SERVICE_PREFIX) + |
83 | 0 | NS_ConvertUTF16toUTF8(aLang); |
84 | 0 | } else if (!prefValue.IsEmpty()) { |
85 | 0 | speechRecognitionService = prefValue; |
86 | 0 | } else { |
87 | 0 | speechRecognitionService = DEFAULT_RECOGNITION_SERVICE; |
88 | 0 | } |
89 | 0 | |
90 | 0 | if (StaticPrefs::MediaWebspeechTextFakeRecognitionService()) { |
91 | 0 | speechRecognitionServiceCID = |
92 | 0 | NS_SPEECH_RECOGNITION_SERVICE_CONTRACTID_PREFIX "fake"; |
93 | 0 | } else { |
94 | 0 | speechRecognitionServiceCID = |
95 | 0 | NS_LITERAL_CSTRING(NS_SPEECH_RECOGNITION_SERVICE_CONTRACTID_PREFIX) + |
96 | 0 | speechRecognitionService; |
97 | 0 | } |
98 | 0 | |
99 | 0 | nsresult rv; |
100 | 0 | nsCOMPtr<nsISpeechRecognitionService> recognitionService; |
101 | 0 | recognitionService = do_GetService(speechRecognitionServiceCID.get(), &rv); |
102 | 0 | return recognitionService.forget(); |
103 | 0 | } |
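For orientation, the contract ID composed above resolves as follows under a few hypothetical inputs (PREFIX stands for NS_SPEECH_RECOGNITION_SERVICE_CONTRACTID_PREFIX; the language and preference values are invented for illustration):

    aLang = "en-US" (any pref value)            -> PREFIX + "pocketsphinx-en-US"
    aLang empty, pref = "pocketsphinx-fr-FR"    -> PREFIX + "pocketsphinx-fr-FR"
    aLang empty, pref empty                     -> PREFIX + "pocketsphinx-en-US"
    fake-service test pref enabled              -> PREFIX + "fake" (overrides all of the above)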
104 | | |
105 | | NS_IMPL_CYCLE_COLLECTION_INHERITED(SpeechRecognition, |
106 | | DOMEventTargetHelper, |
107 | | mDOMStream, |
108 | | mSpeechGrammarList) |
109 | | |
110 | 0 | NS_INTERFACE_MAP_BEGIN_CYCLE_COLLECTION(SpeechRecognition) |
111 | 0 | NS_INTERFACE_MAP_ENTRY(nsIObserver) |
112 | 0 | NS_INTERFACE_MAP_END_INHERITING(DOMEventTargetHelper) |
113 | | |
114 | | NS_IMPL_ADDREF_INHERITED(SpeechRecognition, DOMEventTargetHelper) |
115 | | NS_IMPL_RELEASE_INHERITED(SpeechRecognition, DOMEventTargetHelper) |
116 | | |
117 | | SpeechRecognition::SpeechRecognition(nsPIDOMWindowInner* aOwnerWindow) |
118 | | : DOMEventTargetHelper(aOwnerWindow) |
119 | | , mEndpointer(kSAMPLE_RATE) |
120 | | , mAudioSamplesPerChunk(mEndpointer.FrameSize()) |
121 | | , mSpeechDetectionTimer(NS_NewTimer()) |
122 | | , mSpeechGrammarList(new SpeechGrammarList(GetParentObject())) |
123 | | , mInterimResults(false) |
124 | | , mMaxAlternatives(1) |
125 | 0 | { |
126 | 0 | SR_LOG("created SpeechRecognition"); |
127 | 0 | |
128 | 0 | if (StaticPrefs::MediaWebspeechTestEnable()) { |
129 | 0 | nsCOMPtr<nsIObserverService> obs = services::GetObserverService(); |
130 | 0 | obs->AddObserver(this, SPEECH_RECOGNITION_TEST_EVENT_REQUEST_TOPIC, false); |
131 | 0 | obs->AddObserver(this, SPEECH_RECOGNITION_TEST_END_TOPIC, false); |
132 | 0 | } |
133 | 0 | |
134 | 0 | mEndpointer.set_speech_input_complete_silence_length( |
135 | 0 | Preferences::GetInt(PREFERENCE_ENDPOINTER_SILENCE_LENGTH, 1250000)); |
136 | 0 | mEndpointer.set_long_speech_input_complete_silence_length( |
137 | 0 | Preferences::GetInt(PREFERENCE_ENDPOINTER_LONG_SILENCE_LENGTH, 2500000)); |
138 | 0 | mEndpointer.set_long_speech_length( |
139 | 0 | Preferences::GetInt(PREFERENCE_ENDPOINTER_LONG_SPEECH_LENGTH, 3 * 1000000)); |
140 | 0 | Reset(); |
141 | 0 | } |
142 | | |
143 | | bool |
144 | | SpeechRecognition::StateBetween(FSMState begin, FSMState end) |
145 | 0 | { |
146 | 0 | return mCurrentState >= begin && mCurrentState <= end; |
147 | 0 | } |
148 | | |
149 | | void |
150 | | SpeechRecognition::SetState(FSMState state) |
151 | 0 | { |
152 | 0 | mCurrentState = state; |
153 | 0 | SR_LOG("Transitioned to state %s", GetName(mCurrentState)); |
154 | 0 | } |
155 | | |
156 | | JSObject* |
157 | | SpeechRecognition::WrapObject(JSContext* aCx, JS::Handle<JSObject*> aGivenProto) |
158 | 0 | { |
159 | 0 | return SpeechRecognition_Binding::Wrap(aCx, this, aGivenProto); |
160 | 0 | } |
161 | | |
162 | | bool |
163 | | SpeechRecognition::IsAuthorized(JSContext* aCx, JSObject* aGlobal) |
164 | 0 | { |
165 | 0 | nsCOMPtr<nsIPrincipal> principal = nsContentUtils::ObjectPrincipal(aGlobal); |
166 | 0 | |
167 | 0 | nsresult rv; |
168 | 0 | nsCOMPtr<nsIPermissionManager> mgr = |
169 | 0 | do_GetService(NS_PERMISSIONMANAGER_CONTRACTID, &rv); |
170 | 0 | if (NS_WARN_IF(NS_FAILED(rv))) { |
171 | 0 | return false; |
172 | 0 | } |
173 | 0 | |
174 | 0 | uint32_t speechRecognition = nsIPermissionManager::UNKNOWN_ACTION; |
175 | 0 | rv = mgr->TestExactPermissionFromPrincipal( |
176 | 0 | principal, "speech-recognition", &speechRecognition); |
177 | 0 | if (NS_WARN_IF(NS_FAILED(rv))) { |
178 | 0 | return false; |
179 | 0 | } |
180 | 0 | |
181 | 0 | bool hasPermission = |
182 | 0 | (speechRecognition == nsIPermissionManager::ALLOW_ACTION); |
183 | 0 | |
184 | 0 | return (hasPermission || |
185 | 0 | StaticPrefs::MediaWebspeechRecognitionForceEnable() || |
186 | 0 | StaticPrefs::MediaWebspeechTestEnable()) && |
187 | 0 | StaticPrefs::MediaWebspeechRecognitionEnable(); |
188 | 0 | } |
189 | | |
190 | | already_AddRefed<SpeechRecognition> |
191 | | SpeechRecognition::Constructor(const GlobalObject& aGlobal, |
192 | | ErrorResult& aRv) |
193 | 0 | { |
194 | 0 | nsCOMPtr<nsPIDOMWindowInner> win = do_QueryInterface(aGlobal.GetAsSupports()); |
195 | 0 | if (!win) { |
196 | 0 | aRv.Throw(NS_ERROR_FAILURE); |
197 | 0 | return nullptr; |
198 | 0 | } |
199 | 0 | |
200 | 0 | RefPtr<SpeechRecognition> object = new SpeechRecognition(win); |
201 | 0 | return object.forget(); |
202 | 0 | } |
203 | | |
204 | | nsISupports* |
205 | | SpeechRecognition::GetParentObject() const |
206 | 0 | { |
207 | 0 | return GetOwner(); |
208 | 0 | } |
209 | | |
210 | | void |
211 | | SpeechRecognition::ProcessEvent(SpeechEvent* aEvent) |
212 | 0 | { |
213 | 0 | SR_LOG("Processing %s, current state is %s", |
214 | 0 | GetName(aEvent), |
215 | 0 | GetName(mCurrentState)); |
216 | 0 | |
217 | 0 | if (mAborted && aEvent->mType != EVENT_ABORT) { |
218 | 0 | // ignore all events while aborting |
219 | 0 | return; |
220 | 0 | } |
221 | 0 | |
222 | 0 | Transition(aEvent); |
223 | 0 | } |
224 | | |
225 | | void |
226 | | SpeechRecognition::Transition(SpeechEvent* aEvent) |
227 | 0 | { |
228 | 0 | switch (mCurrentState) { |
229 | 0 | case STATE_IDLE: |
230 | 0 | switch (aEvent->mType) { |
231 | 0 | case EVENT_START: |
232 | 0 | // TODO: may want to time out if we wait too long |
233 | 0 | // for user to approve |
234 | 0 | WaitForAudioData(aEvent); |
235 | 0 | break; |
236 | 0 | case EVENT_STOP: |
237 | 0 | case EVENT_ABORT: |
238 | 0 | case EVENT_AUDIO_DATA: |
239 | 0 | case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT: |
240 | 0 | case EVENT_RECOGNITIONSERVICE_FINAL_RESULT: |
241 | 0 | DoNothing(aEvent); |
242 | 0 | break; |
243 | 0 | case EVENT_AUDIO_ERROR: |
244 | 0 | case EVENT_RECOGNITIONSERVICE_ERROR: |
245 | 0 | AbortError(aEvent); |
246 | 0 | break; |
247 | 0 | case EVENT_COUNT: |
248 | 0 | MOZ_CRASH("Invalid event EVENT_COUNT"); |
249 | 0 | } |
250 | 0 | break; |
251 | 0 | case STATE_STARTING: |
252 | 0 | switch (aEvent->mType) { |
253 | 0 | case EVENT_AUDIO_DATA: |
254 | 0 | StartedAudioCapture(aEvent); |
255 | 0 | break; |
256 | 0 | case EVENT_AUDIO_ERROR: |
257 | 0 | case EVENT_RECOGNITIONSERVICE_ERROR: |
258 | 0 | AbortError(aEvent); |
259 | 0 | break; |
260 | 0 | case EVENT_ABORT: |
261 | 0 | AbortSilently(aEvent); |
262 | 0 | break; |
263 | 0 | case EVENT_STOP: |
264 | 0 | Reset(); |
265 | 0 | break; |
266 | 0 | case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT: |
267 | 0 | case EVENT_RECOGNITIONSERVICE_FINAL_RESULT: |
268 | 0 | DoNothing(aEvent); |
269 | 0 | break; |
270 | 0 | case EVENT_START: |
271 | 0 | SR_LOG("STATE_STARTING: Unhandled event %s", GetName(aEvent)); |
272 | 0 | MOZ_CRASH(); |
273 | 0 | case EVENT_COUNT: |
274 | 0 | MOZ_CRASH("Invalid event EVENT_COUNT"); |
275 | 0 | } |
276 | 0 | break; |
277 | 0 | case STATE_ESTIMATING: |
278 | 0 | switch (aEvent->mType) { |
279 | 0 | case EVENT_AUDIO_DATA: |
280 | 0 | WaitForEstimation(aEvent); |
281 | 0 | break; |
282 | 0 | case EVENT_STOP: |
283 | 0 | StopRecordingAndRecognize(aEvent); |
284 | 0 | break; |
285 | 0 | case EVENT_ABORT: |
286 | 0 | AbortSilently(aEvent); |
287 | 0 | break; |
288 | 0 | case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT: |
289 | 0 | case EVENT_RECOGNITIONSERVICE_FINAL_RESULT: |
290 | 0 | case EVENT_RECOGNITIONSERVICE_ERROR: |
291 | 0 | DoNothing(aEvent); |
292 | 0 | break; |
293 | 0 | case EVENT_AUDIO_ERROR: |
294 | 0 | AbortError(aEvent); |
295 | 0 | break; |
296 | 0 | case EVENT_START: |
297 | 0 | SR_LOG("STATE_ESTIMATING: Unhandled event %s", GetName(aEvent)); |
298 | 0 | MOZ_CRASH(); |
299 | 0 | case EVENT_COUNT: |
300 | 0 | MOZ_CRASH("Invalid event EVENT_COUNT"); |
301 | 0 | } |
302 | 0 | break; |
303 | 0 | case STATE_WAITING_FOR_SPEECH: |
304 | 0 | switch (aEvent->mType) { |
305 | 0 | case EVENT_AUDIO_DATA: |
306 | 0 | DetectSpeech(aEvent); |
307 | 0 | break; |
308 | 0 | case EVENT_STOP: |
309 | 0 | StopRecordingAndRecognize(aEvent); |
310 | 0 | break; |
311 | 0 | case EVENT_ABORT: |
312 | 0 | AbortSilently(aEvent); |
313 | 0 | break; |
314 | 0 | case EVENT_AUDIO_ERROR: |
315 | 0 | AbortError(aEvent); |
316 | 0 | break; |
317 | 0 | case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT: |
318 | 0 | case EVENT_RECOGNITIONSERVICE_FINAL_RESULT: |
319 | 0 | case EVENT_RECOGNITIONSERVICE_ERROR: |
320 | 0 | DoNothing(aEvent); |
321 | 0 | break; |
322 | 0 | case EVENT_START: |
323 | 0 | SR_LOG("STATE_WAITING_FOR_SPEECH: Unhandled event %s", GetName(aEvent)); |
324 | 0 | MOZ_CRASH(); |
325 | 0 | case EVENT_COUNT: |
326 | 0 | MOZ_CRASH("Invalid event EVENT_COUNT"); |
327 | 0 | } |
328 | 0 | break; |
329 | 0 | case STATE_RECOGNIZING: |
330 | 0 | switch (aEvent->mType) { |
331 | 0 | case EVENT_AUDIO_DATA: |
332 | 0 | WaitForSpeechEnd(aEvent); |
333 | 0 | break; |
334 | 0 | case EVENT_STOP: |
335 | 0 | StopRecordingAndRecognize(aEvent); |
336 | 0 | break; |
337 | 0 | case EVENT_AUDIO_ERROR: |
338 | 0 | case EVENT_RECOGNITIONSERVICE_ERROR: |
339 | 0 | AbortError(aEvent); |
340 | 0 | break; |
341 | 0 | case EVENT_ABORT: |
342 | 0 | AbortSilently(aEvent); |
343 | 0 | break; |
344 | 0 | case EVENT_RECOGNITIONSERVICE_FINAL_RESULT: |
345 | 0 | case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT: |
346 | 0 | DoNothing(aEvent); |
347 | 0 | break; |
348 | 0 | case EVENT_START: |
349 | 0 | SR_LOG("STATE_RECOGNIZING: Unhandled aEvent %s", GetName(aEvent)); |
350 | 0 | MOZ_CRASH(); |
351 | 0 | case EVENT_COUNT: |
352 | 0 | MOZ_CRASH("Invalid event EVENT_COUNT"); |
353 | 0 | } |
354 | 0 | break; |
355 | 0 | case STATE_WAITING_FOR_RESULT: |
356 | 0 | switch (aEvent->mType) { |
357 | 0 | case EVENT_STOP: |
358 | 0 | DoNothing(aEvent); |
359 | 0 | break; |
360 | 0 | case EVENT_AUDIO_ERROR: |
361 | 0 | case EVENT_RECOGNITIONSERVICE_ERROR: |
362 | 0 | AbortError(aEvent); |
363 | 0 | break; |
364 | 0 | case EVENT_RECOGNITIONSERVICE_FINAL_RESULT: |
365 | 0 | NotifyFinalResult(aEvent); |
366 | 0 | break; |
367 | 0 | case EVENT_AUDIO_DATA: |
368 | 0 | DoNothing(aEvent); |
369 | 0 | break; |
370 | 0 | case EVENT_ABORT: |
371 | 0 | AbortSilently(aEvent); |
372 | 0 | break; |
373 | 0 | case EVENT_START: |
374 | 0 | case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT: |
375 | 0 | SR_LOG("STATE_WAITING_FOR_RESULT: Unhandled aEvent %s", |
376 | 0 | GetName(aEvent)); |
377 | 0 | MOZ_CRASH(); |
378 | 0 | case EVENT_COUNT: |
379 | 0 | MOZ_CRASH("Invalid event EVENT_COUNT"); |
380 | 0 | } |
381 | 0 | break; |
382 | 0 | case STATE_COUNT: |
383 | 0 | MOZ_CRASH("Invalid state STATE_COUNT"); |
384 | 0 | } |
385 | 0 | } |
386 | | |
387 | | /* |
388 | | * Handle a segment of recorded audio data. |
389 | | * Returns the number of samples that were processed. |
390 | | */ |
391 | | uint32_t |
392 | | SpeechRecognition::ProcessAudioSegment(AudioSegment* aSegment, |
393 | | TrackRate aTrackRate) |
394 | 0 | { |
395 | 0 | AudioSegment::ChunkIterator iterator(*aSegment); |
396 | 0 | uint32_t samples = 0; |
397 | 0 | while (!iterator.IsEnded()) { |
398 | 0 | float out; |
399 | 0 | mEndpointer.ProcessAudio(*iterator, &out); |
400 | 0 | samples += iterator->GetDuration(); |
401 | 0 | iterator.Next(); |
402 | 0 | } |
403 | 0 | |
404 | 0 | mRecognitionService->ProcessAudioSegment(aSegment, aTrackRate); |
405 | 0 | return samples; |
406 | 0 | } |
407 | | |
408 | | /**************************************************************************** |
409 | | * FSM Transition functions |
410 | | * |
411 | | * If a transition function may cause a DOM event to be fired, |
412 | | * it may also be re-entered, since the event handler may cause the |
413 | | * event loop to spin and new SpeechEvents to be processed. |
414 | | * |
415 | | * Rules: |
416 | | * 1) These methods should call SetState as soon as possible. |
417 | | * 2) If these methods dispatch DOM events, or call methods that dispatch |
418 | | * DOM events, that should be done as late as possible. |
419 | | * 3) If anything must happen after dispatching a DOM event, make sure |
420 | | * the state is still what the method expected it to be. |
421 | | ****************************************************************************/ |
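As an illustration of these rules, here is a minimal sketch of the shape a transition handler takes. The handler itself (ExampleTransition) is hypothetical and not part of this file; SetState, DispatchTrustedEvent and mCurrentState are the members used by the real handlers below.

void
SpeechRecognition::ExampleTransition(SpeechEvent* aEvent)  // hypothetical handler
{
  // Rule 1: record the new state before anything observable happens.
  SetState(STATE_RECOGNIZING);

  // ... internal bookkeeping that cannot run script goes here ...

  // Rule 2: dispatch DOM events as late as possible; the handler may spin
  // the event loop and feed new SpeechEvents back into Transition().
  DispatchTrustedEvent(NS_LITERAL_STRING("speechstart"));

  // Rule 3: after dispatching, re-check the state before relying on it,
  // since a re-entrant transition may have moved the FSM elsewhere.
  if (mCurrentState == STATE_RECOGNIZING) {
    // Safe to continue with work that assumes we are still recognizing.
  }
}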
422 | | |
423 | | void |
424 | | SpeechRecognition::Reset() |
425 | 0 | { |
426 | 0 | SetState(STATE_IDLE); |
427 | 0 | mRecognitionService = nullptr; |
428 | 0 | mEstimationSamples = 0; |
429 | 0 | mBufferedSamples = 0; |
430 | 0 | mSpeechDetectionTimer->Cancel(); |
431 | 0 | mAborted = false; |
432 | 0 | } |
433 | | |
434 | | void |
435 | | SpeechRecognition::ResetAndEnd() |
436 | 0 | { |
437 | 0 | Reset(); |
438 | 0 | DispatchTrustedEvent(NS_LITERAL_STRING("end")); |
439 | 0 | } |
440 | | |
441 | | void |
442 | | SpeechRecognition::WaitForAudioData(SpeechEvent* aEvent) |
443 | 0 | { |
444 | 0 | SetState(STATE_STARTING); |
445 | 0 | } |
446 | | |
447 | | void |
448 | | SpeechRecognition::StartedAudioCapture(SpeechEvent* aEvent) |
449 | 0 | { |
450 | 0 | SetState(STATE_ESTIMATING); |
451 | 0 | |
452 | 0 | mEndpointer.SetEnvironmentEstimationMode(); |
453 | 0 | mEstimationSamples += |
454 | 0 | ProcessAudioSegment(aEvent->mAudioSegment, aEvent->mTrackRate); |
455 | 0 | |
456 | 0 | DispatchTrustedEvent(NS_LITERAL_STRING("audiostart")); |
457 | 0 | if (mCurrentState == STATE_ESTIMATING) { |
458 | 0 | DispatchTrustedEvent(NS_LITERAL_STRING("start")); |
459 | 0 | } |
460 | 0 | } |
461 | | |
462 | | void |
463 | | SpeechRecognition::StopRecordingAndRecognize(SpeechEvent* aEvent) |
464 | 0 | { |
465 | 0 | SetState(STATE_WAITING_FOR_RESULT); |
466 | 0 | |
467 | 0 | MOZ_ASSERT(mRecognitionService, "Service deleted before recording done"); |
468 | 0 | mRecognitionService->SoundEnd(); |
469 | 0 | |
470 | 0 | StopRecording(); |
471 | 0 | } |
472 | | |
473 | | void |
474 | | SpeechRecognition::WaitForEstimation(SpeechEvent* aEvent) |
475 | 0 | { |
476 | 0 | SetState(STATE_ESTIMATING); |
477 | 0 | |
478 | 0 | mEstimationSamples += |
479 | 0 | ProcessAudioSegment(aEvent->mAudioSegment, aEvent->mTrackRate); |
480 | 0 | if (mEstimationSamples > kESTIMATION_SAMPLES) { |
481 | 0 | mEndpointer.SetUserInputMode(); |
482 | 0 | SetState(STATE_WAITING_FOR_SPEECH); |
483 | 0 | } |
484 | 0 | } |
485 | | |
486 | | void |
487 | | SpeechRecognition::DetectSpeech(SpeechEvent* aEvent) |
488 | 0 | { |
489 | 0 | SetState(STATE_WAITING_FOR_SPEECH); |
490 | 0 | |
491 | 0 | ProcessAudioSegment(aEvent->mAudioSegment, aEvent->mTrackRate); |
492 | 0 | if (mEndpointer.DidStartReceivingSpeech()) { |
493 | 0 | mSpeechDetectionTimer->Cancel(); |
494 | 0 | SetState(STATE_RECOGNIZING); |
495 | 0 | DispatchTrustedEvent(NS_LITERAL_STRING("speechstart")); |
496 | 0 | } |
497 | 0 | } |
498 | | |
499 | | void |
500 | | SpeechRecognition::WaitForSpeechEnd(SpeechEvent* aEvent) |
501 | 0 | { |
502 | 0 | SetState(STATE_RECOGNIZING); |
503 | 0 | |
504 | 0 | ProcessAudioSegment(aEvent->mAudioSegment, aEvent->mTrackRate); |
505 | 0 | if (mEndpointer.speech_input_complete()) { |
506 | 0 | DispatchTrustedEvent(NS_LITERAL_STRING("speechend")); |
507 | 0 | |
508 | 0 | if (mCurrentState == STATE_RECOGNIZING) { |
509 | 0 | // FIXME: StopRecordingAndRecognize should only be called for single-shot |
510 | 0 | // services; for continuous services we should just inform the service. |
511 | 0 | StopRecordingAndRecognize(aEvent); |
512 | 0 | } |
513 | 0 | } |
514 | 0 | } |
515 | | |
516 | | void |
517 | | SpeechRecognition::NotifyFinalResult(SpeechEvent* aEvent) |
518 | 0 | { |
519 | 0 | ResetAndEnd(); |
520 | 0 | |
521 | 0 | RootedDictionary<SpeechRecognitionEventInit> init(RootingCx()); |
522 | 0 | init.mBubbles = true; |
523 | 0 | init.mCancelable = false; |
524 | 0 | // init.mResultIndex = 0; |
525 | 0 | init.mResults = aEvent->mRecognitionResultList; |
526 | 0 | init.mInterpretation = JS::NullValue(); |
527 | 0 | // init.mEmma = nullptr; |
528 | 0 | |
529 | 0 | RefPtr<SpeechRecognitionEvent> event = SpeechRecognitionEvent::Constructor( |
530 | 0 | this, NS_LITERAL_STRING("result"), init); |
531 | 0 | event->SetTrusted(true); |
532 | 0 | |
533 | 0 | DispatchEvent(*event); |
534 | 0 | } |
535 | | |
536 | | void |
537 | | SpeechRecognition::DoNothing(SpeechEvent* aEvent) |
538 | 0 | { |
539 | 0 | } |
540 | | |
541 | | void |
542 | | SpeechRecognition::AbortSilently(SpeechEvent* aEvent) |
543 | 0 | { |
544 | 0 | if (mRecognitionService) { |
545 | 0 | mRecognitionService->Abort(); |
546 | 0 | } |
547 | 0 | |
548 | 0 | if (mDOMStream) { |
549 | 0 | StopRecording(); |
550 | 0 | } |
551 | 0 | |
552 | 0 | ResetAndEnd(); |
553 | 0 | } |
554 | | |
555 | | void |
556 | | SpeechRecognition::AbortError(SpeechEvent* aEvent) |
557 | 0 | { |
558 | 0 | AbortSilently(aEvent); |
559 | 0 | NotifyError(aEvent); |
560 | 0 | } |
561 | | |
562 | | void |
563 | | SpeechRecognition::NotifyError(SpeechEvent* aEvent) |
564 | 0 | { |
565 | 0 | aEvent->mError->SetTrusted(true); |
566 | 0 | |
567 | 0 | DispatchEvent(*aEvent->mError); |
568 | 0 | } |
569 | | |
570 | | /************************************** |
571 | | * Event triggers and other functions * |
572 | | **************************************/ |
573 | | NS_IMETHODIMP |
574 | | SpeechRecognition::StartRecording(DOMMediaStream* aDOMStream) |
575 | 0 | { |
576 | 0 | // hold a reference so that the underlying stream |
577 | 0 | // doesn't get Destroy()'ed |
578 | 0 | mDOMStream = aDOMStream; |
579 | 0 | |
580 | 0 | if (NS_WARN_IF(!mDOMStream->GetPlaybackStream())) { |
581 | 0 | return NS_ERROR_UNEXPECTED; |
582 | 0 | } |
583 | 0 | mSpeechListener = new SpeechStreamListener(this); |
584 | 0 | mDOMStream->GetPlaybackStream()->AddListener(mSpeechListener); |
585 | 0 | |
586 | 0 | mEndpointer.StartSession(); |
587 | 0 | |
588 | 0 | return mSpeechDetectionTimer->Init(this, kSPEECH_DETECTION_TIMEOUT_MS, |
589 | 0 | nsITimer::TYPE_ONE_SHOT); |
590 | 0 | } |
591 | | |
592 | | NS_IMETHODIMP |
593 | | SpeechRecognition::StopRecording() |
594 | 0 | { |
595 | 0 | // we only really need to remove the listener explicitly when testing, |
596 | 0 | // as our JS code still holds a reference to mDOMStream and only assigning |
597 | 0 | // it to nullptr isn't guaranteed to free the stream and the listener. |
598 | 0 | mDOMStream->GetPlaybackStream()->RemoveListener(mSpeechListener); |
599 | 0 | mSpeechListener = nullptr; |
600 | 0 | mDOMStream = nullptr; |
601 | 0 | |
602 | 0 | mEndpointer.EndSession(); |
603 | 0 | DispatchTrustedEvent(NS_LITERAL_STRING("audioend")); |
604 | 0 | |
605 | 0 | return NS_OK; |
606 | 0 | } |
607 | | |
608 | | NS_IMETHODIMP |
609 | | SpeechRecognition::Observe(nsISupports* aSubject, const char* aTopic, |
610 | | const char16_t* aData) |
611 | 0 | { |
612 | 0 | MOZ_ASSERT(NS_IsMainThread(), "Observer invoked off the main thread"); |
613 | 0 | |
614 | 0 | if (!strcmp(aTopic, NS_TIMER_CALLBACK_TOPIC) && |
615 | 0 | StateBetween(STATE_IDLE, STATE_WAITING_FOR_SPEECH)) { |
616 | 0 | |
617 | 0 | DispatchError(SpeechRecognition::EVENT_AUDIO_ERROR, |
618 | 0 | SpeechRecognitionErrorCode::No_speech, |
619 | 0 | NS_LITERAL_STRING("No speech detected (timeout)")); |
620 | 0 | } else if (!strcmp(aTopic, SPEECH_RECOGNITION_TEST_END_TOPIC)) { |
621 | 0 | nsCOMPtr<nsIObserverService> obs = services::GetObserverService(); |
622 | 0 | obs->RemoveObserver(this, SPEECH_RECOGNITION_TEST_EVENT_REQUEST_TOPIC); |
623 | 0 | obs->RemoveObserver(this, SPEECH_RECOGNITION_TEST_END_TOPIC); |
624 | 0 | } else if (StaticPrefs::MediaWebspeechTextFakeFsmEvents() && |
625 | 0 | !strcmp(aTopic, SPEECH_RECOGNITION_TEST_EVENT_REQUEST_TOPIC)) { |
626 | 0 | ProcessTestEventRequest(aSubject, nsDependentString(aData)); |
627 | 0 | } |
628 | 0 | |
629 | 0 | return NS_OK; |
630 | 0 | } |
631 | | |
632 | | void |
633 | | SpeechRecognition::ProcessTestEventRequest(nsISupports* aSubject, |
634 | | const nsAString& aEventName) |
635 | 0 | { |
636 | 0 | if (aEventName.EqualsLiteral("EVENT_ABORT")) { |
637 | 0 | Abort(); |
638 | 0 | } else if (aEventName.EqualsLiteral("EVENT_AUDIO_ERROR")) { |
639 | 0 | DispatchError( |
640 | 0 | SpeechRecognition::EVENT_AUDIO_ERROR, |
641 | 0 | SpeechRecognitionErrorCode::Audio_capture, // TODO different codes? |
642 | 0 | NS_LITERAL_STRING("AUDIO_ERROR test event")); |
643 | 0 | } else { |
644 | 0 | NS_ASSERTION(StaticPrefs::MediaWebspeechTextFakeRecognitionService(), |
645 | 0 | "Got request for fake recognition service event, but " |
646 | 0 | "media.webspeech.test.fake_recognition_service is unset"); |
647 | 0 | |
648 | 0 | // let the fake recognition service handle the request |
649 | 0 | } |
650 | 0 | } |
651 | | |
652 | | already_AddRefed<SpeechGrammarList> |
653 | | SpeechRecognition::Grammars() const |
654 | 0 | { |
655 | 0 | RefPtr<SpeechGrammarList> speechGrammarList = mSpeechGrammarList; |
656 | 0 | return speechGrammarList.forget(); |
657 | 0 | } |
658 | | |
659 | | void |
660 | | SpeechRecognition::SetGrammars(SpeechGrammarList& aArg) |
661 | 0 | { |
662 | 0 | mSpeechGrammarList = &aArg; |
663 | 0 | } |
664 | | |
665 | | void |
666 | | SpeechRecognition::GetLang(nsString& aRetVal) const |
667 | 0 | { |
668 | 0 | aRetVal = mLang; |
669 | 0 | } |
670 | | |
671 | | void |
672 | | SpeechRecognition::SetLang(const nsAString& aArg) |
673 | 0 | { |
674 | 0 | mLang = aArg; |
675 | 0 | } |
676 | | |
677 | | bool |
678 | | SpeechRecognition::GetContinuous(ErrorResult& aRv) const |
679 | 0 | { |
680 | 0 | aRv.Throw(NS_ERROR_NOT_IMPLEMENTED); |
681 | 0 | return false; |
682 | 0 | } |
683 | | |
684 | | void |
685 | | SpeechRecognition::SetContinuous(bool aArg, ErrorResult& aRv) |
686 | 0 | { |
687 | 0 | aRv.Throw(NS_ERROR_NOT_IMPLEMENTED); |
688 | 0 | } |
689 | | |
690 | | bool |
691 | | SpeechRecognition::InterimResults() const |
692 | 0 | { |
693 | 0 | return mInterimResults; |
694 | 0 | } |
695 | | |
696 | | void |
697 | | SpeechRecognition::SetInterimResults(bool aArg) |
698 | 0 | { |
699 | 0 | mInterimResults = aArg; |
700 | 0 | } |
701 | | |
702 | | uint32_t |
703 | | SpeechRecognition::MaxAlternatives() const |
704 | 0 | { |
705 | 0 | return mMaxAlternatives; |
706 | 0 | } |
707 | | |
708 | | void |
709 | | SpeechRecognition::SetMaxAlternatives(uint32_t aArg) |
710 | 0 | { |
711 | 0 | mMaxAlternatives = aArg; |
712 | 0 | } |
713 | | |
714 | | void |
715 | | SpeechRecognition::GetServiceURI(nsString& aRetVal, ErrorResult& aRv) const |
716 | 0 | { |
717 | 0 | aRv.Throw(NS_ERROR_NOT_IMPLEMENTED); |
718 | 0 | } |
719 | | |
720 | | void |
721 | | SpeechRecognition::SetServiceURI(const nsAString& aArg, ErrorResult& aRv) |
722 | 0 | { |
723 | 0 | aRv.Throw(NS_ERROR_NOT_IMPLEMENTED); |
724 | 0 | } |
725 | | |
726 | | void |
727 | | SpeechRecognition::Start(const Optional<NonNull<DOMMediaStream>>& aStream, |
728 | | CallerType aCallerType, |
729 | | ErrorResult& aRv) |
730 | 0 | { |
731 | 0 | if (mCurrentState != STATE_IDLE) { |
732 | 0 | aRv.Throw(NS_ERROR_DOM_INVALID_STATE_ERR); |
733 | 0 | return; |
734 | 0 | } |
735 | 0 | |
736 | 0 | if (!SetRecognitionService(aRv)) { |
737 | 0 | return; |
738 | 0 | } |
739 | 0 | |
740 | 0 | if (!ValidateAndSetGrammarList(aRv)) { |
741 | 0 | return; |
742 | 0 | } |
743 | 0 | |
744 | 0 | nsresult rv; |
745 | 0 | rv = mRecognitionService->Initialize(this); |
746 | 0 | if (NS_WARN_IF(NS_FAILED(rv))) { |
747 | 0 | return; |
748 | 0 | } |
749 | 0 | |
750 | 0 | MediaStreamConstraints constraints; |
751 | 0 | constraints.mAudio.SetAsBoolean() = true; |
752 | 0 | |
753 | 0 | if (aStream.WasPassed()) { |
754 | 0 | StartRecording(&aStream.Value()); |
755 | 0 | } else { |
756 | 0 | AutoNoJSAPI nojsapi; |
757 | 0 | MediaManager* manager = MediaManager::Get(); |
758 | 0 | MediaManager::GetUserMediaSuccessCallback onsuccess( |
759 | 0 | new GetUserMediaSuccessCallback(this)); |
760 | 0 | MediaManager::GetUserMediaErrorCallback onerror( |
761 | 0 | new GetUserMediaErrorCallback(this)); |
762 | 0 | manager->GetUserMedia(GetOwner(), |
763 | 0 | constraints, |
764 | 0 | std::move(onsuccess), |
765 | 0 | std::move(onerror), |
766 | 0 | aCallerType); |
767 | 0 | } |
768 | 0 | |
769 | 0 | RefPtr<SpeechEvent> event = new SpeechEvent(this, EVENT_START); |
770 | 0 | NS_DispatchToMainThread(event); |
771 | 0 | } |
772 | | |
773 | | bool |
774 | | SpeechRecognition::SetRecognitionService(ErrorResult& aRv) |
775 | 0 | { |
776 | 0 | // See: https://dvcs.w3.org/hg/speech-api/raw-file/tip/webspeechapi.html#dfn-lang |
777 | 0 | if (!mLang.IsEmpty()) { |
778 | 0 | mRecognitionService = GetSpeechRecognitionService(mLang); |
779 | 0 | |
780 | 0 | if (!mRecognitionService) { |
781 | 0 | aRv.Throw(NS_ERROR_DOM_INVALID_STATE_ERR); |
782 | 0 | return false; |
783 | 0 | } |
784 | 0 | |
785 | 0 | return true; |
786 | 0 | } |
787 | 0 | |
788 | 0 | nsCOMPtr<nsPIDOMWindowInner> window = GetOwner(); |
789 | 0 | if (!window) { |
790 | 0 | aRv.Throw(NS_ERROR_DOM_INVALID_STATE_ERR); |
791 | 0 | return false; |
792 | 0 | } |
793 | 0 | nsCOMPtr<nsIDocument> document = window->GetExtantDoc(); |
794 | 0 | if (!document) { |
795 | 0 | aRv.Throw(NS_ERROR_DOM_INVALID_STATE_ERR); |
796 | 0 | return false; |
797 | 0 | } |
798 | 0 | nsCOMPtr<Element> element = document->GetRootElement(); |
799 | 0 | if (!element) { |
800 | 0 | aRv.Throw(NS_ERROR_DOM_INVALID_STATE_ERR); |
801 | 0 | return false; |
802 | 0 | } |
803 | 0 | |
804 | 0 | nsAutoString lang; |
805 | 0 | element->GetLang(lang); |
806 | 0 | mRecognitionService = GetSpeechRecognitionService(lang); |
807 | 0 | |
808 | 0 | if (!mRecognitionService) { |
809 | 0 | aRv.Throw(NS_ERROR_DOM_INVALID_STATE_ERR); |
810 | 0 | return false; |
811 | 0 | } |
812 | 0 | |
813 | 0 | return true; |
814 | 0 | } |
815 | | |
816 | | bool |
817 | | SpeechRecognition::ValidateAndSetGrammarList(ErrorResult& aRv) |
818 | 0 | { |
819 | 0 | if (!mSpeechGrammarList) { |
820 | 0 | aRv.Throw(NS_ERROR_DOM_INVALID_STATE_ERR); |
821 | 0 | return false; |
822 | 0 | } |
823 | 0 | |
824 | 0 | uint32_t grammarListLength = mSpeechGrammarList->Length(); |
825 | 0 | if (0 == grammarListLength) { |
826 | 0 | aRv.Throw(NS_ERROR_DOM_INVALID_STATE_ERR); |
827 | 0 | return false; |
828 | 0 | } |
829 | 0 | |
830 | 0 | for (uint32_t count = 0; count < grammarListLength; ++count) { |
831 | 0 | RefPtr<SpeechGrammar> speechGrammar = mSpeechGrammarList->Item(count, aRv); |
832 | 0 | if (aRv.Failed()) { |
833 | 0 | return false; |
834 | 0 | } |
835 | 0 | if (NS_FAILED(mRecognitionService->ValidateAndSetGrammarList( |
836 | 0 | speechGrammar.get(), nullptr))) { |
837 | 0 | aRv.Throw(NS_ERROR_DOM_INVALID_STATE_ERR); |
838 | 0 | return false; |
839 | 0 | } |
840 | 0 | } |
841 | 0 | |
842 | 0 | return true; |
843 | 0 | } |
844 | | |
845 | | void |
846 | | SpeechRecognition::Stop() |
847 | 0 | { |
848 | 0 | RefPtr<SpeechEvent> event = new SpeechEvent(this, EVENT_STOP); |
849 | 0 | NS_DispatchToMainThread(event); |
850 | 0 | } |
851 | | |
852 | | void |
853 | | SpeechRecognition::Abort() |
854 | 0 | { |
855 | 0 | if (mAborted) { |
856 | 0 | return; |
857 | 0 | } |
858 | 0 | |
859 | 0 | mAborted = true; |
860 | 0 | RefPtr<SpeechEvent> event = new SpeechEvent(this, EVENT_ABORT); |
861 | 0 | NS_DispatchToMainThread(event); |
862 | 0 | } |
863 | | |
864 | | void |
865 | | SpeechRecognition::DispatchError(EventType aErrorType, |
866 | | SpeechRecognitionErrorCode aErrorCode, |
867 | | const nsAString& aMessage) |
868 | 0 | { |
869 | 0 | MOZ_ASSERT(NS_IsMainThread()); |
870 | 0 | MOZ_ASSERT(aErrorType == EVENT_RECOGNITIONSERVICE_ERROR || |
871 | 0 | aErrorType == EVENT_AUDIO_ERROR, "Invalid error type!"); |
872 | 0 | |
873 | 0 | RefPtr<SpeechRecognitionError> srError = |
874 | 0 | new SpeechRecognitionError(nullptr, nullptr, nullptr); |
875 | 0 | |
876 | 0 | srError->InitSpeechRecognitionError(NS_LITERAL_STRING("error"), true, false, |
877 | 0 | aErrorCode, aMessage); |
878 | 0 | |
879 | 0 | RefPtr<SpeechEvent> event = new SpeechEvent(this, aErrorType); |
880 | 0 | event->mError = srError; |
881 | 0 | NS_DispatchToMainThread(event); |
882 | 0 | } |
883 | | |
884 | | /* |
885 | | * Buffer audio samples into mAudioSamplesBuffer until aBufferSize. |
886 | | * Buffer audio samples into mAudioSamplesBuffer, up to mAudioSamplesPerChunk samples. |
887 | | */ |
888 | | uint32_t |
889 | | SpeechRecognition::FillSamplesBuffer(const int16_t* aSamples, |
890 | | uint32_t aSampleCount) |
891 | 0 | { |
892 | 0 | MOZ_ASSERT(mBufferedSamples < mAudioSamplesPerChunk); |
893 | 0 | MOZ_ASSERT(mAudioSamplesBuffer.get()); |
894 | 0 | |
895 | 0 | int16_t* samplesBuffer = static_cast<int16_t*>(mAudioSamplesBuffer->Data()); |
896 | 0 | size_t samplesToCopy = std::min(aSampleCount, |
897 | 0 | mAudioSamplesPerChunk - mBufferedSamples); |
898 | 0 | |
899 | 0 | memcpy(samplesBuffer + mBufferedSamples, aSamples, |
900 | 0 | samplesToCopy * sizeof(int16_t)); |
901 | 0 | |
902 | 0 | mBufferedSamples += samplesToCopy; |
903 | 0 | return samplesToCopy; |
904 | 0 | } |
905 | | |
906 | | /* |
907 | | * Split a samples buffer of a given size into chunks of |
908 | | * mAudioSamplesPerChunk samples each. The chunks are appended to the |
909 | | * array received as argument. |
910 | | * Returns the offset of the end of the last chunk that was |
911 | | * created. |
912 | | */ |
913 | | uint32_t |
914 | | SpeechRecognition::SplitSamplesBuffer(const int16_t* aSamplesBuffer, |
915 | | uint32_t aSampleCount, |
916 | | nsTArray<RefPtr<SharedBuffer>>& aResult) |
917 | 0 | { |
918 | 0 | uint32_t chunkStart = 0; |
919 | 0 | |
920 | 0 | while (chunkStart + mAudioSamplesPerChunk <= aSampleCount) { |
921 | 0 | RefPtr<SharedBuffer> chunk = |
922 | 0 | SharedBuffer::Create(mAudioSamplesPerChunk * sizeof(int16_t)); |
923 | 0 | |
924 | 0 | memcpy(chunk->Data(), aSamplesBuffer + chunkStart, |
925 | 0 | mAudioSamplesPerChunk * sizeof(int16_t)); |
926 | 0 | |
927 | 0 | aResult.AppendElement(chunk.forget()); |
928 | 0 | chunkStart += mAudioSamplesPerChunk; |
929 | 0 | } |
930 | 0 | |
931 | 0 | return chunkStart; |
932 | 0 | } |
933 | | |
934 | | AudioSegment* |
935 | | SpeechRecognition::CreateAudioSegment(nsTArray<RefPtr<SharedBuffer>>& aChunks) |
936 | 0 | { |
937 | 0 | AudioSegment* segment = new AudioSegment(); |
938 | 0 | for (uint32_t i = 0; i < aChunks.Length(); ++i) { |
939 | 0 | RefPtr<SharedBuffer> buffer = aChunks[i]; |
940 | 0 | const int16_t* chunkData = static_cast<const int16_t*>(buffer->Data()); |
941 | 0 | |
942 | 0 | AutoTArray<const int16_t*, 1> channels; |
943 | 0 | channels.AppendElement(chunkData); |
944 | 0 | segment->AppendFrames(buffer.forget(), channels, mAudioSamplesPerChunk, |
945 | 0 | PRINCIPAL_HANDLE_NONE); |
946 | 0 | } |
947 | 0 | |
948 | 0 | return segment; |
949 | 0 | } |
950 | | |
951 | | void |
952 | | SpeechRecognition::FeedAudioData(already_AddRefed<SharedBuffer> aSamples, |
953 | | uint32_t aDuration, |
954 | | MediaStreamListener* aProvider, |
955 | | TrackRate aTrackRate) |
956 | 0 | { |
957 | 0 | NS_ASSERTION(!NS_IsMainThread(), |
958 | 0 | "FeedAudioData should not be called in the main thread"); |
959 | 0 | |
960 | 0 | // Endpointer expects to receive samples in chunks whose size is a |
961 | 0 | // multiple of its frame size. |
962 | 0 | // Since we can't assume we will receive the frames in appropriate-sized |
963 | 0 | // chunks, we must buffer and split them in chunks of mAudioSamplesPerChunk |
964 | 0 | // (a multiple of Endpointer's frame size) before feeding to Endpointer. |
965 | 0 | |
966 | 0 | // ensure aSamples is deleted |
967 | 0 | RefPtr<SharedBuffer> refSamples = aSamples; |
968 | 0 | |
969 | 0 | uint32_t samplesIndex = 0; |
970 | 0 | const int16_t* samples = static_cast<int16_t*>(refSamples->Data()); |
971 | 0 | AutoTArray<RefPtr<SharedBuffer>, 5> chunksToSend; |
972 | 0 | |
973 | 0 | // fill up our buffer and make a chunk out of it, if possible |
974 | 0 | if (mBufferedSamples > 0) { |
975 | 0 | samplesIndex += FillSamplesBuffer(samples, aDuration); |
976 | 0 | |
977 | 0 | if (mBufferedSamples == mAudioSamplesPerChunk) { |
978 | 0 | chunksToSend.AppendElement(mAudioSamplesBuffer.forget()); |
979 | 0 | mBufferedSamples = 0; |
980 | 0 | } |
981 | 0 | } |
982 | 0 | |
983 | 0 | // create sample chunks of correct size |
984 | 0 | if (samplesIndex < aDuration) { |
985 | 0 | samplesIndex += SplitSamplesBuffer(samples + samplesIndex, |
986 | 0 | aDuration - samplesIndex, |
987 | 0 | chunksToSend); |
988 | 0 | } |
989 | 0 | |
990 | 0 | // buffer remaining samples |
991 | 0 | if (samplesIndex < aDuration) { |
992 | 0 | mBufferedSamples = 0; |
993 | 0 | mAudioSamplesBuffer = |
994 | 0 | SharedBuffer::Create(mAudioSamplesPerChunk * sizeof(int16_t)); |
995 | 0 | |
996 | 0 | FillSamplesBuffer(samples + samplesIndex, aDuration - samplesIndex); |
997 | 0 | } |
998 | 0 | |
999 | 0 | AudioSegment* segment = CreateAudioSegment(chunksToSend); |
1000 | 0 | RefPtr<SpeechEvent> event = new SpeechEvent(this, EVENT_AUDIO_DATA); |
1001 | 0 | event->mAudioSegment = segment; |
1002 | 0 | event->mProvider = aProvider; |
1003 | 0 | event->mTrackRate = aTrackRate; |
1004 | 0 | NS_DispatchToMainThread(event); |
1005 | 0 | } |
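To make the buffering arithmetic above concrete, here is a small self-contained sketch in standard C++. The chunk size is a stand-in for mAudioSamplesPerChunk (which the real code takes from Endpointer::FrameSize()), and the example numbers are invented.

#include <algorithm>
#include <cstddef>

// How many whole chunks a batch of samples yields, and how many samples are
// left buffered afterwards, mirroring FillSamplesBuffer + SplitSamplesBuffer.
struct ChunkSplit {
  size_t wholeChunks;  // complete chunks handed to the recognition service
  size_t leftover;     // samples carried over in mAudioSamplesBuffer
};

ChunkSplit SplitIntoChunks(size_t buffered, size_t incoming, size_t chunkSize)
{
  // Top up the partially filled buffer first (FillSamplesBuffer).
  size_t toFill = std::min(incoming, chunkSize - buffered);
  buffered += toFill;
  incoming -= toFill;

  size_t chunks = 0;
  if (buffered == chunkSize) {
    ++chunks;          // the topped-up buffer becomes one complete chunk
    buffered = 0;
  }

  // Cut whole chunks out of the remaining samples (SplitSamplesBuffer).
  chunks += incoming / chunkSize;

  // Whatever is left waits in the buffer for the next FeedAudioData call.
  return { chunks, buffered + incoming % chunkSize };
}

For example, with a 480-sample chunk and 100 samples already buffered, feeding 1000 samples gives SplitIntoChunks(100, 1000, 480) == {2, 140}: two chunks are dispatched to the recognition service and 140 samples stay buffered.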
1006 | | |
1007 | | const char* |
1008 | | SpeechRecognition::GetName(FSMState aId) |
1009 | 0 | { |
1010 | 0 | static const char* names[] = { |
1011 | 0 | "STATE_IDLE", |
1012 | 0 | "STATE_STARTING", |
1013 | 0 | "STATE_ESTIMATING", |
1014 | 0 | "STATE_WAITING_FOR_SPEECH", |
1015 | 0 | "STATE_RECOGNIZING", |
1016 | 0 | "STATE_WAITING_FOR_RESULT", |
1017 | 0 | }; |
1018 | 0 | |
1019 | 0 | MOZ_ASSERT(aId < STATE_COUNT); |
1020 | 0 | MOZ_ASSERT(ArrayLength(names) == STATE_COUNT); |
1021 | 0 | return names[aId]; |
1022 | 0 | } |
1023 | | |
1024 | | const char* |
1025 | | SpeechRecognition::GetName(SpeechEvent* aEvent) |
1026 | 0 | { |
1027 | 0 | static const char* names[] = { |
1028 | 0 | "EVENT_START", |
1029 | 0 | "EVENT_STOP", |
1030 | 0 | "EVENT_ABORT", |
1031 | 0 | "EVENT_AUDIO_DATA", |
1032 | 0 | "EVENT_AUDIO_ERROR", |
1033 | 0 | "EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT", |
1034 | 0 | "EVENT_RECOGNITIONSERVICE_FINAL_RESULT", |
1035 | 0 | "EVENT_RECOGNITIONSERVICE_ERROR" |
1036 | 0 | }; |
1037 | 0 | |
1038 | 0 | MOZ_ASSERT(aEvent->mType < EVENT_COUNT); |
1039 | 0 | MOZ_ASSERT(ArrayLength(names) == EVENT_COUNT); |
1040 | 0 | return names[aEvent->mType]; |
1041 | 0 | } |
1042 | | |
1043 | | SpeechEvent::~SpeechEvent() |
1044 | 0 | { |
1045 | 0 | delete mAudioSegment; |
1046 | 0 | } |
1047 | | |
1048 | | NS_IMETHODIMP |
1049 | | SpeechEvent::Run() |
1050 | 0 | { |
1051 | 0 | mRecognition->ProcessEvent(this); |
1052 | 0 | return NS_OK; |
1053 | 0 | } |
1054 | | |
1055 | | NS_IMPL_ISUPPORTS(SpeechRecognition::GetUserMediaSuccessCallback, |
1056 | | nsIDOMGetUserMediaSuccessCallback) |
1057 | | |
1058 | | NS_IMETHODIMP |
1059 | | SpeechRecognition::GetUserMediaSuccessCallback::OnSuccess(nsISupports* aStream) |
1060 | 0 | { |
1061 | 0 | RefPtr<DOMMediaStream> stream = do_QueryObject(aStream); |
1062 | 0 | if (!stream) { |
1063 | 0 | return NS_ERROR_NO_INTERFACE; |
1064 | 0 | } |
1065 | 0 | mRecognition->StartRecording(stream); |
1066 | 0 | return NS_OK; |
1067 | 0 | } |
1068 | | |
1069 | | NS_IMPL_ISUPPORTS(SpeechRecognition::GetUserMediaErrorCallback, |
1070 | | nsIDOMGetUserMediaErrorCallback) |
1071 | | |
1072 | | NS_IMETHODIMP |
1073 | | SpeechRecognition::GetUserMediaErrorCallback::OnError(nsISupports* aError) |
1074 | 0 | { |
1075 | 0 | RefPtr<MediaStreamError> error = do_QueryObject(aError); |
1076 | 0 | if (!error) { |
1077 | 0 | return NS_OK; |
1078 | 0 | } |
1079 | 0 | SpeechRecognitionErrorCode errorCode; |
1080 | 0 | |
1081 | 0 | nsAutoString name; |
1082 | 0 | error->GetName(name); |
1083 | 0 | if (name.EqualsLiteral("PERMISSION_DENIED")) { |
1084 | 0 | errorCode = SpeechRecognitionErrorCode::Not_allowed; |
1085 | 0 | } else { |
1086 | 0 | errorCode = SpeechRecognitionErrorCode::Audio_capture; |
1087 | 0 | } |
1088 | 0 | |
1089 | 0 | nsAutoString message; |
1090 | 0 | error->GetMessage(message); |
1091 | 0 | mRecognition->DispatchError(SpeechRecognition::EVENT_AUDIO_ERROR, errorCode, |
1092 | 0 | message); |
1093 | 0 | return NS_OK; |
1094 | 0 | } |
1095 | | |
1096 | | } // namespace dom |
1097 | | } // namespace mozilla |