Coverage Report

Created: 2018-09-25 14:53

/src/mozilla-central/dom/media/webspeech/recognition/SpeechRecognition.cpp
/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* vim:set ts=2 sw=2 sts=2 et cindent: */
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#include "SpeechRecognition.h"

#include "nsCOMPtr.h"
#include "nsCycleCollectionParticipant.h"

#include "mozilla/dom/BindingUtils.h"
#include "mozilla/dom/Element.h"
#include "mozilla/dom/SpeechRecognitionBinding.h"
#include "mozilla/dom/MediaStreamTrackBinding.h"
#include "mozilla/dom/MediaStreamError.h"
#include "mozilla/MediaManager.h"
#include "mozilla/Preferences.h"
#include "mozilla/Services.h"
#include "mozilla/StaticPrefs.h"

#include "AudioSegment.h"
#include "DOMMediaStream.h"
#include "MediaEnginePrefs.h"
#include "endpointer.h"

#include "mozilla/dom/SpeechRecognitionEvent.h"
#include "nsContentUtils.h"
#include "nsIDocument.h"
#include "nsIObserverService.h"
#include "nsIPermissionManager.h"
#include "nsIPrincipal.h"
#include "nsPIDOMWindow.h"
#include "nsServiceManagerUtils.h"
#include "nsQueryObject.h"

#include <algorithm>

// Undo the windows.h damage
#if defined(XP_WIN) && defined(GetMessage)
#undef GetMessage
#endif

namespace mozilla {
namespace dom {

#define PREFERENCE_DEFAULT_RECOGNITION_SERVICE "media.webspeech.service.default"
#define DEFAULT_RECOGNITION_SERVICE_PREFIX "pocketsphinx-"
#define DEFAULT_RECOGNITION_SERVICE "pocketsphinx-en-US"

#define PREFERENCE_ENDPOINTER_SILENCE_LENGTH "media.webspeech.silence_length"
#define PREFERENCE_ENDPOINTER_LONG_SILENCE_LENGTH "media.webspeech.long_silence_length"
#define PREFERENCE_ENDPOINTER_LONG_SPEECH_LENGTH "media.webspeech.long_speech_length"

static const uint32_t kSAMPLE_RATE = 16000;
static const uint32_t kSPEECH_DETECTION_TIMEOUT_MS = 10000;

// number of frames corresponding to 300ms of audio to send to endpointer while
// it's in environment estimation mode
// kSAMPLE_RATE frames = 1s, kESTIMATION_SAMPLES frames = 300ms
static const uint32_t kESTIMATION_SAMPLES = 300 * kSAMPLE_RATE / 1000;
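// With kSAMPLE_RATE = 16000 this works out to 300 * 16000 / 1000 = 4800
// samples of environment-estimation audio.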

LogModule*
GetSpeechRecognitionLog()
{
  static LazyLogModule sLog("SpeechRecognition");
  return sLog;
}
#define SR_LOG(...) MOZ_LOG(GetSpeechRecognitionLog(), mozilla::LogLevel::Debug, (__VA_ARGS__))

already_AddRefed<nsISpeechRecognitionService>
GetSpeechRecognitionService(const nsAString& aLang)
{
  nsAutoCString speechRecognitionServiceCID;

  nsAutoCString prefValue;
  Preferences::GetCString(PREFERENCE_DEFAULT_RECOGNITION_SERVICE, prefValue);
  nsAutoCString speechRecognitionService;

  if (!aLang.IsEmpty()) {
    speechRecognitionService =
      NS_LITERAL_CSTRING(DEFAULT_RECOGNITION_SERVICE_PREFIX) +
      NS_ConvertUTF16toUTF8(aLang);
  } else if (!prefValue.IsEmpty()) {
    speechRecognitionService = prefValue;
  } else {
    speechRecognitionService = DEFAULT_RECOGNITION_SERVICE;
  }

  if (StaticPrefs::MediaWebspeechTextFakeRecognitionService()) {
    speechRecognitionServiceCID =
      NS_SPEECH_RECOGNITION_SERVICE_CONTRACTID_PREFIX "fake";
  } else {
    speechRecognitionServiceCID =
      NS_LITERAL_CSTRING(NS_SPEECH_RECOGNITION_SERVICE_CONTRACTID_PREFIX) +
      speechRecognitionService;
  }

  nsresult rv;
  nsCOMPtr<nsISpeechRecognitionService> recognitionService;
  recognitionService = do_GetService(speechRecognitionServiceCID.get(), &rv);
  return recognitionService.forget();
}
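// For example, a call with aLang = "en-US" resolves the contract ID
// NS_SPEECH_RECOGNITION_SERVICE_CONTRACTID_PREFIX + "pocketsphinx-en-US";
// with no language and no pref set, DEFAULT_RECOGNITION_SERVICE
// ("pocketsphinx-en-US") is used instead.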

NS_IMPL_CYCLE_COLLECTION_INHERITED(SpeechRecognition,
                                   DOMEventTargetHelper,
                                   mDOMStream,
                                   mSpeechGrammarList)

NS_INTERFACE_MAP_BEGIN_CYCLE_COLLECTION(SpeechRecognition)
  NS_INTERFACE_MAP_ENTRY(nsIObserver)
NS_INTERFACE_MAP_END_INHERITING(DOMEventTargetHelper)

NS_IMPL_ADDREF_INHERITED(SpeechRecognition, DOMEventTargetHelper)
NS_IMPL_RELEASE_INHERITED(SpeechRecognition, DOMEventTargetHelper)

SpeechRecognition::SpeechRecognition(nsPIDOMWindowInner* aOwnerWindow)
  : DOMEventTargetHelper(aOwnerWindow)
  , mEndpointer(kSAMPLE_RATE)
  , mAudioSamplesPerChunk(mEndpointer.FrameSize())
  , mSpeechDetectionTimer(NS_NewTimer())
  , mSpeechGrammarList(new SpeechGrammarList(GetParentObject()))
  , mInterimResults(false)
  , mMaxAlternatives(1)
{
  SR_LOG("created SpeechRecognition");

  if (StaticPrefs::MediaWebspeechTestEnable()) {
    nsCOMPtr<nsIObserverService> obs = services::GetObserverService();
    obs->AddObserver(this, SPEECH_RECOGNITION_TEST_EVENT_REQUEST_TOPIC, false);
    obs->AddObserver(this, SPEECH_RECOGNITION_TEST_END_TOPIC, false);
  }

  mEndpointer.set_speech_input_complete_silence_length(
      Preferences::GetInt(PREFERENCE_ENDPOINTER_SILENCE_LENGTH, 1250000));
  mEndpointer.set_long_speech_input_complete_silence_length(
      Preferences::GetInt(PREFERENCE_ENDPOINTER_LONG_SILENCE_LENGTH, 2500000));
  mEndpointer.set_long_speech_length(
      Preferences::GetInt(PREFERENCE_ENDPOINTER_LONG_SPEECH_LENGTH, 3 * 1000000));
  Reset();
}
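// Note: the endpointer lengths above are most likely expressed in
// microseconds (1250000 = 1.25 s of silence, 3 * 1000000 = 3 s of speech);
// the media.webspeech.* prefs override these defaults.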

bool
SpeechRecognition::StateBetween(FSMState begin, FSMState end)
{
  return mCurrentState >= begin && mCurrentState <= end;
}

void
SpeechRecognition::SetState(FSMState state)
{
  mCurrentState = state;
  SR_LOG("Transitioned to state %s", GetName(mCurrentState));
}

JSObject*
SpeechRecognition::WrapObject(JSContext* aCx, JS::Handle<JSObject*> aGivenProto)
{
  return SpeechRecognition_Binding::Wrap(aCx, this, aGivenProto);
}

bool
SpeechRecognition::IsAuthorized(JSContext* aCx, JSObject* aGlobal)
{
  nsCOMPtr<nsIPrincipal> principal = nsContentUtils::ObjectPrincipal(aGlobal);

  nsresult rv;
  nsCOMPtr<nsIPermissionManager> mgr =
    do_GetService(NS_PERMISSIONMANAGER_CONTRACTID, &rv);
  if (NS_WARN_IF(NS_FAILED(rv))) {
    return false;
  }

  uint32_t speechRecognition = nsIPermissionManager::UNKNOWN_ACTION;
  rv = mgr->TestExactPermissionFromPrincipal(
    principal, "speech-recognition", &speechRecognition);
  if (NS_WARN_IF(NS_FAILED(rv))) {
    return false;
  }

  bool hasPermission =
    (speechRecognition == nsIPermissionManager::ALLOW_ACTION);

  return (hasPermission ||
          StaticPrefs::MediaWebspeechRecognitionForceEnable() ||
          StaticPrefs::MediaWebspeechTestEnable()) &&
         StaticPrefs::MediaWebspeechRecognitionEnable();
}
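// In other words: MediaWebspeechRecognitionEnable() gates the API globally,
// and in addition the caller must either hold the "speech-recognition"
// permission or have one of the force-enable/test prefs flipped.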

already_AddRefed<SpeechRecognition>
SpeechRecognition::Constructor(const GlobalObject& aGlobal,
                               ErrorResult& aRv)
{
  nsCOMPtr<nsPIDOMWindowInner> win = do_QueryInterface(aGlobal.GetAsSupports());
  if (!win) {
    aRv.Throw(NS_ERROR_FAILURE);
    return nullptr;
  }

  RefPtr<SpeechRecognition> object = new SpeechRecognition(win);
  return object.forget();
}

nsISupports*
SpeechRecognition::GetParentObject() const
{
  return GetOwner();
}

void
SpeechRecognition::ProcessEvent(SpeechEvent* aEvent)
{
  SR_LOG("Processing %s, current state is %s",
         GetName(aEvent),
         GetName(mCurrentState));

  if (mAborted && aEvent->mType != EVENT_ABORT) {
    // ignore all events while aborting
    return;
  }

  Transition(aEvent);
}

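// Overview of the state machine implemented below (state x event -> action):
//
//   STATE_IDLE:               EVENT_START -> WaitForAudioData;
//                             errors -> AbortError; everything else ignored
//   STATE_STARTING:           EVENT_AUDIO_DATA -> StartedAudioCapture;
//                             EVENT_STOP -> Reset
//   STATE_ESTIMATING:         EVENT_AUDIO_DATA -> WaitForEstimation;
//                             EVENT_STOP -> StopRecordingAndRecognize
//   STATE_WAITING_FOR_SPEECH: EVENT_AUDIO_DATA -> DetectSpeech;
//                             EVENT_STOP -> StopRecordingAndRecognize
//   STATE_RECOGNIZING:        EVENT_AUDIO_DATA -> WaitForSpeechEnd;
//                             EVENT_STOP -> StopRecordingAndRecognize
//   STATE_WAITING_FOR_RESULT: EVENT_RECOGNITIONSERVICE_FINAL_RESULT ->
//                             NotifyFinalResult
//
// In the non-idle states EVENT_ABORT runs AbortSilently, audio errors run
// AbortError, and a duplicate EVENT_START is a coding error (MOZ_CRASH);
// service errors also run AbortError, except while estimating or waiting
// for speech, where they are ignored.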
void
SpeechRecognition::Transition(SpeechEvent* aEvent)
{
  switch (mCurrentState) {
    case STATE_IDLE:
      switch (aEvent->mType) {
        case EVENT_START:
          // TODO: may want to time out if we wait too long
          // for user to approve
          WaitForAudioData(aEvent);
          break;
        case EVENT_STOP:
        case EVENT_ABORT:
        case EVENT_AUDIO_DATA:
        case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT:
        case EVENT_RECOGNITIONSERVICE_FINAL_RESULT:
          DoNothing(aEvent);
          break;
        case EVENT_AUDIO_ERROR:
        case EVENT_RECOGNITIONSERVICE_ERROR:
          AbortError(aEvent);
          break;
        case EVENT_COUNT:
          MOZ_CRASH("Invalid event EVENT_COUNT");
      }
      break;
    case STATE_STARTING:
      switch (aEvent->mType) {
        case EVENT_AUDIO_DATA:
          StartedAudioCapture(aEvent);
          break;
        case EVENT_AUDIO_ERROR:
        case EVENT_RECOGNITIONSERVICE_ERROR:
          AbortError(aEvent);
          break;
        case EVENT_ABORT:
          AbortSilently(aEvent);
          break;
        case EVENT_STOP:
          Reset();
          break;
        case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT:
        case EVENT_RECOGNITIONSERVICE_FINAL_RESULT:
          DoNothing(aEvent);
          break;
        case EVENT_START:
          SR_LOG("STATE_STARTING: Unhandled event %s", GetName(aEvent));
          MOZ_CRASH();
        case EVENT_COUNT:
          MOZ_CRASH("Invalid event EVENT_COUNT");
      }
      break;
    case STATE_ESTIMATING:
      switch (aEvent->mType) {
        case EVENT_AUDIO_DATA:
          WaitForEstimation(aEvent);
          break;
        case EVENT_STOP:
          StopRecordingAndRecognize(aEvent);
          break;
        case EVENT_ABORT:
          AbortSilently(aEvent);
          break;
        case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT:
        case EVENT_RECOGNITIONSERVICE_FINAL_RESULT:
        case EVENT_RECOGNITIONSERVICE_ERROR:
          DoNothing(aEvent);
          break;
        case EVENT_AUDIO_ERROR:
          AbortError(aEvent);
          break;
        case EVENT_START:
          SR_LOG("STATE_ESTIMATING: Unhandled event %s", GetName(aEvent));
          MOZ_CRASH();
        case EVENT_COUNT:
          MOZ_CRASH("Invalid event EVENT_COUNT");
      }
      break;
    case STATE_WAITING_FOR_SPEECH:
      switch (aEvent->mType) {
        case EVENT_AUDIO_DATA:
          DetectSpeech(aEvent);
          break;
        case EVENT_STOP:
          StopRecordingAndRecognize(aEvent);
          break;
        case EVENT_ABORT:
          AbortSilently(aEvent);
          break;
        case EVENT_AUDIO_ERROR:
          AbortError(aEvent);
          break;
        case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT:
        case EVENT_RECOGNITIONSERVICE_FINAL_RESULT:
        case EVENT_RECOGNITIONSERVICE_ERROR:
          DoNothing(aEvent);
          break;
        case EVENT_START:
          SR_LOG("STATE_WAITING_FOR_SPEECH: Unhandled event %s",
                 GetName(aEvent));
          MOZ_CRASH();
        case EVENT_COUNT:
          MOZ_CRASH("Invalid event EVENT_COUNT");
      }
      break;
    case STATE_RECOGNIZING:
      switch (aEvent->mType) {
        case EVENT_AUDIO_DATA:
          WaitForSpeechEnd(aEvent);
          break;
        case EVENT_STOP:
          StopRecordingAndRecognize(aEvent);
          break;
        case EVENT_AUDIO_ERROR:
        case EVENT_RECOGNITIONSERVICE_ERROR:
          AbortError(aEvent);
          break;
        case EVENT_ABORT:
          AbortSilently(aEvent);
          break;
        case EVENT_RECOGNITIONSERVICE_FINAL_RESULT:
        case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT:
          DoNothing(aEvent);
          break;
        case EVENT_START:
          SR_LOG("STATE_RECOGNIZING: Unhandled event %s", GetName(aEvent));
          MOZ_CRASH();
        case EVENT_COUNT:
          MOZ_CRASH("Invalid event EVENT_COUNT");
      }
      break;
    case STATE_WAITING_FOR_RESULT:
      switch (aEvent->mType) {
        case EVENT_STOP:
          DoNothing(aEvent);
          break;
        case EVENT_AUDIO_ERROR:
        case EVENT_RECOGNITIONSERVICE_ERROR:
          AbortError(aEvent);
          break;
        case EVENT_RECOGNITIONSERVICE_FINAL_RESULT:
          NotifyFinalResult(aEvent);
          break;
        case EVENT_AUDIO_DATA:
          DoNothing(aEvent);
          break;
        case EVENT_ABORT:
          AbortSilently(aEvent);
          break;
        case EVENT_START:
        case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT:
          SR_LOG("STATE_WAITING_FOR_RESULT: Unhandled event %s",
                 GetName(aEvent));
          MOZ_CRASH();
        case EVENT_COUNT:
          MOZ_CRASH("Invalid event EVENT_COUNT");
      }
      break;
    case STATE_COUNT:
      MOZ_CRASH("Invalid state STATE_COUNT");
  }
}

/*
 * Handle a segment of recorded audio data.
 * Returns the number of samples that were processed.
 */
uint32_t
SpeechRecognition::ProcessAudioSegment(AudioSegment* aSegment,
                                       TrackRate aTrackRate)
{
  AudioSegment::ChunkIterator iterator(*aSegment);
  uint32_t samples = 0;
  while (!iterator.IsEnded()) {
    float out;
    mEndpointer.ProcessAudio(*iterator, &out);
    samples += iterator->GetDuration();
    iterator.Next();
  }

  mRecognitionService->ProcessAudioSegment(aSegment, aTrackRate);
  return samples;
}

/****************************************************************************
 * FSM Transition functions
 *
 * If a transition function may cause a DOM event to be fired,
 * it may also be re-entered, since the event handler may cause the
 * event loop to spin and new SpeechEvents to be processed.
 *
 * Rules:
 * 1) These methods should call SetState as soon as possible.
 * 2) If these methods dispatch DOM events, or call methods that dispatch
 * DOM events, that should be done as late as possible.
 * 3) If anything must happen after dispatching a DOM event, make sure
 * the state is still what the method expected it to be.
 ****************************************************************************/
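// StartedAudioCapture below is a concrete instance of rule 3: after
// dispatching "audiostart" it re-checks that mCurrentState is still
// STATE_ESTIMATING before dispatching "start", since the "audiostart"
// handler may have spun the event loop and moved the FSM elsewhere.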

void
SpeechRecognition::Reset()
{
  SetState(STATE_IDLE);
  mRecognitionService = nullptr;
  mEstimationSamples = 0;
  mBufferedSamples = 0;
  mSpeechDetectionTimer->Cancel();
  mAborted = false;
}

void
SpeechRecognition::ResetAndEnd()
{
  Reset();
  DispatchTrustedEvent(NS_LITERAL_STRING("end"));
}

void
SpeechRecognition::WaitForAudioData(SpeechEvent* aEvent)
{
  SetState(STATE_STARTING);
}

void
SpeechRecognition::StartedAudioCapture(SpeechEvent* aEvent)
{
  SetState(STATE_ESTIMATING);

  mEndpointer.SetEnvironmentEstimationMode();
  mEstimationSamples +=
    ProcessAudioSegment(aEvent->mAudioSegment, aEvent->mTrackRate);

  DispatchTrustedEvent(NS_LITERAL_STRING("audiostart"));
  if (mCurrentState == STATE_ESTIMATING) {
    DispatchTrustedEvent(NS_LITERAL_STRING("start"));
  }
}

void
SpeechRecognition::StopRecordingAndRecognize(SpeechEvent* aEvent)
{
  SetState(STATE_WAITING_FOR_RESULT);

  MOZ_ASSERT(mRecognitionService, "Service deleted before recording done");
  mRecognitionService->SoundEnd();

  StopRecording();
}

void
SpeechRecognition::WaitForEstimation(SpeechEvent* aEvent)
{
  SetState(STATE_ESTIMATING);

  mEstimationSamples +=
    ProcessAudioSegment(aEvent->mAudioSegment, aEvent->mTrackRate);
  if (mEstimationSamples > kESTIMATION_SAMPLES) {
    mEndpointer.SetUserInputMode();
    SetState(STATE_WAITING_FOR_SPEECH);
  }
}

void
SpeechRecognition::DetectSpeech(SpeechEvent* aEvent)
{
  SetState(STATE_WAITING_FOR_SPEECH);

  ProcessAudioSegment(aEvent->mAudioSegment, aEvent->mTrackRate);
  if (mEndpointer.DidStartReceivingSpeech()) {
    mSpeechDetectionTimer->Cancel();
    SetState(STATE_RECOGNIZING);
    DispatchTrustedEvent(NS_LITERAL_STRING("speechstart"));
  }
}

void
SpeechRecognition::WaitForSpeechEnd(SpeechEvent* aEvent)
{
  SetState(STATE_RECOGNIZING);

  ProcessAudioSegment(aEvent->mAudioSegment, aEvent->mTrackRate);
  if (mEndpointer.speech_input_complete()) {
    DispatchTrustedEvent(NS_LITERAL_STRING("speechend"));

    if (mCurrentState == STATE_RECOGNIZING) {
      // FIXME: StopRecordingAndRecognize should only be called for
      // single-shot services; for continuous services we should just
      // inform the service.
      StopRecordingAndRecognize(aEvent);
    }
  }
}

void
SpeechRecognition::NotifyFinalResult(SpeechEvent* aEvent)
{
  ResetAndEnd();

  RootedDictionary<SpeechRecognitionEventInit> init(RootingCx());
  init.mBubbles = true;
  init.mCancelable = false;
  // init.mResultIndex = 0;
  init.mResults = aEvent->mRecognitionResultList;
  init.mInterpretation = JS::NullValue();
  // init.mEmma = nullptr;

  RefPtr<SpeechRecognitionEvent> event = SpeechRecognitionEvent::Constructor(
    this, NS_LITERAL_STRING("result"), init);
  event->SetTrusted(true);

  DispatchEvent(*event);
}

void
SpeechRecognition::DoNothing(SpeechEvent* aEvent)
{
}

void
SpeechRecognition::AbortSilently(SpeechEvent* aEvent)
{
  if (mRecognitionService) {
    mRecognitionService->Abort();
  }

  if (mDOMStream) {
    StopRecording();
  }

  ResetAndEnd();
}

void
SpeechRecognition::AbortError(SpeechEvent* aEvent)
{
  AbortSilently(aEvent);
  NotifyError(aEvent);
}

void
SpeechRecognition::NotifyError(SpeechEvent* aEvent)
{
  aEvent->mError->SetTrusted(true);

  DispatchEvent(*aEvent->mError);
}

/**************************************
 * Event triggers and other functions *
 **************************************/
NS_IMETHODIMP
SpeechRecognition::StartRecording(DOMMediaStream* aDOMStream)
{
  // hold a reference so that the underlying stream
  // doesn't get Destroy()'ed
  mDOMStream = aDOMStream;

  if (NS_WARN_IF(!mDOMStream->GetPlaybackStream())) {
    return NS_ERROR_UNEXPECTED;
  }
  mSpeechListener = new SpeechStreamListener(this);
  mDOMStream->GetPlaybackStream()->AddListener(mSpeechListener);

  mEndpointer.StartSession();

  return mSpeechDetectionTimer->Init(this, kSPEECH_DETECTION_TIMEOUT_MS,
                                     nsITimer::TYPE_ONE_SHOT);
}

NS_IMETHODIMP
SpeechRecognition::StopRecording()
{
  // we only really need to remove the listener explicitly when testing,
  // as our JS code still holds a reference to mDOMStream and only assigning
  // it to nullptr isn't guaranteed to free the stream and the listener.
  mDOMStream->GetPlaybackStream()->RemoveListener(mSpeechListener);
  mSpeechListener = nullptr;
  mDOMStream = nullptr;

  mEndpointer.EndSession();
  DispatchTrustedEvent(NS_LITERAL_STRING("audioend"));

  return NS_OK;
}

NS_IMETHODIMP
SpeechRecognition::Observe(nsISupports* aSubject, const char* aTopic,
                           const char16_t* aData)
{
  MOZ_ASSERT(NS_IsMainThread(), "Observer invoked off the main thread");

  if (!strcmp(aTopic, NS_TIMER_CALLBACK_TOPIC) &&
      StateBetween(STATE_IDLE, STATE_WAITING_FOR_SPEECH)) {

    DispatchError(SpeechRecognition::EVENT_AUDIO_ERROR,
                  SpeechRecognitionErrorCode::No_speech,
                  NS_LITERAL_STRING("No speech detected (timeout)"));
  } else if (!strcmp(aTopic, SPEECH_RECOGNITION_TEST_END_TOPIC)) {
    nsCOMPtr<nsIObserverService> obs = services::GetObserverService();
    obs->RemoveObserver(this, SPEECH_RECOGNITION_TEST_EVENT_REQUEST_TOPIC);
    obs->RemoveObserver(this, SPEECH_RECOGNITION_TEST_END_TOPIC);
  } else if (StaticPrefs::MediaWebspeechTextFakeFsmEvents() &&
             !strcmp(aTopic, SPEECH_RECOGNITION_TEST_EVENT_REQUEST_TOPIC)) {
    ProcessTestEventRequest(aSubject, nsDependentString(aData));
  }

  return NS_OK;
}

void
SpeechRecognition::ProcessTestEventRequest(nsISupports* aSubject,
                                           const nsAString& aEventName)
{
  if (aEventName.EqualsLiteral("EVENT_ABORT")) {
    Abort();
  } else if (aEventName.EqualsLiteral("EVENT_AUDIO_ERROR")) {
    DispatchError(
      SpeechRecognition::EVENT_AUDIO_ERROR,
      SpeechRecognitionErrorCode::Audio_capture, // TODO different codes?
      NS_LITERAL_STRING("AUDIO_ERROR test event"));
  } else {
    NS_ASSERTION(StaticPrefs::MediaWebspeechTextFakeRecognitionService(),
                 "Got request for fake recognition service event, but "
                 "media.webspeech.test.fake_recognition_service is unset");

    // let the fake recognition service handle the request
  }
}

already_AddRefed<SpeechGrammarList>
SpeechRecognition::Grammars() const
{
  RefPtr<SpeechGrammarList> speechGrammarList = mSpeechGrammarList;
  return speechGrammarList.forget();
}

void
SpeechRecognition::SetGrammars(SpeechGrammarList& aArg)
{
  mSpeechGrammarList = &aArg;
}

void
SpeechRecognition::GetLang(nsString& aRetVal) const
{
  aRetVal = mLang;
}

void
SpeechRecognition::SetLang(const nsAString& aArg)
{
  mLang = aArg;
}

bool
SpeechRecognition::GetContinuous(ErrorResult& aRv) const
{
  aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
  return false;
}

void
SpeechRecognition::SetContinuous(bool aArg, ErrorResult& aRv)
{
  aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
}

bool
SpeechRecognition::InterimResults() const
{
  return mInterimResults;
}

void
SpeechRecognition::SetInterimResults(bool aArg)
{
  mInterimResults = aArg;
}

uint32_t
SpeechRecognition::MaxAlternatives() const
{
  return mMaxAlternatives;
}

void
SpeechRecognition::SetMaxAlternatives(uint32_t aArg)
{
  mMaxAlternatives = aArg;
}

void
SpeechRecognition::GetServiceURI(nsString& aRetVal, ErrorResult& aRv) const
{
  aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
}

void
SpeechRecognition::SetServiceURI(const nsAString& aArg, ErrorResult& aRv)
{
  aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
}

void
SpeechRecognition::Start(const Optional<NonNull<DOMMediaStream>>& aStream,
                         CallerType aCallerType,
                         ErrorResult& aRv)
{
  if (mCurrentState != STATE_IDLE) {
    aRv.Throw(NS_ERROR_DOM_INVALID_STATE_ERR);
    return;
  }

  if (!SetRecognitionService(aRv)) {
    return;
  }

  if (!ValidateAndSetGrammarList(aRv)) {
    return;
  }

  nsresult rv;
  rv = mRecognitionService->Initialize(this);
  if (NS_WARN_IF(NS_FAILED(rv))) {
    return;
  }

  MediaStreamConstraints constraints;
  constraints.mAudio.SetAsBoolean() = true;

  if (aStream.WasPassed()) {
    StartRecording(&aStream.Value());
  } else {
    AutoNoJSAPI nojsapi;
    MediaManager* manager = MediaManager::Get();
    MediaManager::GetUserMediaSuccessCallback onsuccess(
      new GetUserMediaSuccessCallback(this));
    MediaManager::GetUserMediaErrorCallback onerror(
      new GetUserMediaErrorCallback(this));
    manager->GetUserMedia(GetOwner(),
                          constraints,
                          std::move(onsuccess),
                          std::move(onerror),
                          aCallerType);
  }

  RefPtr<SpeechEvent> event = new SpeechEvent(this, EVENT_START);
  NS_DispatchToMainThread(event);
}
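// Start() therefore performs, in order: the idle-state check, service
// selection (SetRecognitionService), grammar validation
// (ValidateAndSetGrammarList), service initialization, audio acquisition
// (either the caller-supplied stream or a getUserMedia request), and
// finally dispatches EVENT_START to move the FSM out of STATE_IDLE.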

bool
SpeechRecognition::SetRecognitionService(ErrorResult& aRv)
{
  // See: https://dvcs.w3.org/hg/speech-api/raw-file/tip/webspeechapi.html#dfn-lang
  if (!mLang.IsEmpty()) {
    mRecognitionService = GetSpeechRecognitionService(mLang);

    if (!mRecognitionService) {
      aRv.Throw(NS_ERROR_DOM_INVALID_STATE_ERR);
      return false;
    }

    return true;
  }

  nsCOMPtr<nsPIDOMWindowInner> window = GetOwner();
  if (!window) {
    aRv.Throw(NS_ERROR_DOM_INVALID_STATE_ERR);
    return false;
  }
  nsCOMPtr<nsIDocument> document = window->GetExtantDoc();
  if (!document) {
    aRv.Throw(NS_ERROR_DOM_INVALID_STATE_ERR);
    return false;
  }
  nsCOMPtr<Element> element = document->GetRootElement();
  if (!element) {
    aRv.Throw(NS_ERROR_DOM_INVALID_STATE_ERR);
    return false;
  }

  nsAutoString lang;
  element->GetLang(lang);
  mRecognitionService = GetSpeechRecognitionService(lang);

  if (!mRecognitionService) {
    aRv.Throw(NS_ERROR_DOM_INVALID_STATE_ERR);
    return false;
  }

  return true;
}

bool
SpeechRecognition::ValidateAndSetGrammarList(ErrorResult& aRv)
{
  if (!mSpeechGrammarList) {
    aRv.Throw(NS_ERROR_DOM_INVALID_STATE_ERR);
    return false;
  }

  uint32_t grammarListLength = mSpeechGrammarList->Length();
  if (0 == grammarListLength) {
    aRv.Throw(NS_ERROR_DOM_INVALID_STATE_ERR);
    return false;
  }

  for (uint32_t count = 0; count < grammarListLength; ++count) {
    RefPtr<SpeechGrammar> speechGrammar = mSpeechGrammarList->Item(count, aRv);
    if (aRv.Failed()) {
      return false;
    }
    if (NS_FAILED(mRecognitionService->ValidateAndSetGrammarList(
          speechGrammar.get(), nullptr))) {
      aRv.Throw(NS_ERROR_DOM_INVALID_STATE_ERR);
      return false;
    }
  }

  return true;
}

void
SpeechRecognition::Stop()
{
  RefPtr<SpeechEvent> event = new SpeechEvent(this, EVENT_STOP);
  NS_DispatchToMainThread(event);
}

void
SpeechRecognition::Abort()
{
  if (mAborted) {
    return;
  }

  mAborted = true;
  RefPtr<SpeechEvent> event = new SpeechEvent(this, EVENT_ABORT);
  NS_DispatchToMainThread(event);
}

void
SpeechRecognition::DispatchError(EventType aErrorType,
                                 SpeechRecognitionErrorCode aErrorCode,
                                 const nsAString& aMessage)
{
  MOZ_ASSERT(NS_IsMainThread());
  MOZ_ASSERT(aErrorType == EVENT_RECOGNITIONSERVICE_ERROR ||
             aErrorType == EVENT_AUDIO_ERROR, "Invalid error type!");

  RefPtr<SpeechRecognitionError> srError =
    new SpeechRecognitionError(nullptr, nullptr, nullptr);

  srError->InitSpeechRecognitionError(NS_LITERAL_STRING("error"), true, false,
                                      aErrorCode, aMessage);

  RefPtr<SpeechEvent> event = new SpeechEvent(this, aErrorType);
  event->mError = srError;
  NS_DispatchToMainThread(event);
}

/*
 * Buffer audio samples into mAudioSamplesBuffer until the buffer holds
 * mAudioSamplesPerChunk samples.
 * Updates mBufferedSamples and returns the number of samples that were
 * buffered.
 */
uint32_t
SpeechRecognition::FillSamplesBuffer(const int16_t* aSamples,
                                     uint32_t aSampleCount)
{
  MOZ_ASSERT(mBufferedSamples < mAudioSamplesPerChunk);
  MOZ_ASSERT(mAudioSamplesBuffer.get());

  int16_t* samplesBuffer = static_cast<int16_t*>(mAudioSamplesBuffer->Data());
  size_t samplesToCopy = std::min(aSampleCount,
                                  mAudioSamplesPerChunk - mBufferedSamples);

  memcpy(samplesBuffer + mBufferedSamples, aSamples,
         samplesToCopy * sizeof(int16_t));

  mBufferedSamples += samplesToCopy;
  return samplesToCopy;
}
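// E.g., if mAudioSamplesPerChunk were 480 with mBufferedSamples = 400 and
// aSampleCount = 200, only min(200, 480 - 400) = 80 samples are copied and
// the call returns 80, leaving the remaining 120 for the caller to handle.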

/*
 * Split a samples buffer of a given size into chunks of
 * mAudioSamplesPerChunk samples. The chunks are stored in the array
 * received as argument.
 * Returns the offset of the end of the last chunk that was created.
 */
uint32_t
SpeechRecognition::SplitSamplesBuffer(const int16_t* aSamplesBuffer,
                                      uint32_t aSampleCount,
                                      nsTArray<RefPtr<SharedBuffer>>& aResult)
{
  uint32_t chunkStart = 0;

  while (chunkStart + mAudioSamplesPerChunk <= aSampleCount) {
    RefPtr<SharedBuffer> chunk =
      SharedBuffer::Create(mAudioSamplesPerChunk * sizeof(int16_t));

    memcpy(chunk->Data(), aSamplesBuffer + chunkStart,
           mAudioSamplesPerChunk * sizeof(int16_t));

    aResult.AppendElement(chunk.forget());
    chunkStart += mAudioSamplesPerChunk;
  }

  return chunkStart;
}
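// E.g., if mAudioSamplesPerChunk were 480 and aSampleCount = 1100, two full
// chunks (samples 0-479 and 480-959) are appended and 960 is returned; the
// 140 samples past the last whole chunk are left for the caller to buffer.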

AudioSegment*
SpeechRecognition::CreateAudioSegment(nsTArray<RefPtr<SharedBuffer>>& aChunks)
{
  AudioSegment* segment = new AudioSegment();
  for (uint32_t i = 0; i < aChunks.Length(); ++i) {
    RefPtr<SharedBuffer> buffer = aChunks[i];
    const int16_t* chunkData = static_cast<const int16_t*>(buffer->Data());

    AutoTArray<const int16_t*, 1> channels;
    channels.AppendElement(chunkData);
    segment->AppendFrames(buffer.forget(), channels, mAudioSamplesPerChunk,
                          PRINCIPAL_HANDLE_NONE);
  }

  return segment;
}

void
SpeechRecognition::FeedAudioData(already_AddRefed<SharedBuffer> aSamples,
                                 uint32_t aDuration,
                                 MediaStreamListener* aProvider,
                                 TrackRate aTrackRate)
{
  NS_ASSERTION(!NS_IsMainThread(),
               "FeedAudioData should not be called in the main thread");

  // Endpointer expects to receive samples in chunks whose size is a
  // multiple of its frame size.
  // Since we can't assume we will receive the frames in appropriate-sized
  // chunks, we must buffer and split them in chunks of mAudioSamplesPerChunk
  // (a multiple of Endpointer's frame size) before feeding to Endpointer.

  // ensure aSamples is deleted
  RefPtr<SharedBuffer> refSamples = aSamples;

  uint32_t samplesIndex = 0;
  const int16_t* samples = static_cast<int16_t*>(refSamples->Data());
  AutoTArray<RefPtr<SharedBuffer>, 5> chunksToSend;

  // fill up our buffer and make a chunk out of it, if possible
  if (mBufferedSamples > 0) {
    samplesIndex += FillSamplesBuffer(samples, aDuration);

    if (mBufferedSamples == mAudioSamplesPerChunk) {
      chunksToSend.AppendElement(mAudioSamplesBuffer.forget());
      mBufferedSamples = 0;
    }
  }

  // create sample chunks of correct size
  if (samplesIndex < aDuration) {
    samplesIndex += SplitSamplesBuffer(samples + samplesIndex,
                                       aDuration - samplesIndex,
                                       chunksToSend);
  }

  // buffer remaining samples
  if (samplesIndex < aDuration) {
    mBufferedSamples = 0;
    mAudioSamplesBuffer =
      SharedBuffer::Create(mAudioSamplesPerChunk * sizeof(int16_t));

    FillSamplesBuffer(samples + samplesIndex, aDuration - samplesIndex);
  }

  AudioSegment* segment = CreateAudioSegment(chunksToSend);
  RefPtr<SpeechEvent> event = new SpeechEvent(this, EVENT_AUDIO_DATA);
  event->mAudioSegment = segment;
  event->mProvider = aProvider;
  event->mTrackRate = aTrackRate;
  NS_DispatchToMainThread(event);
}
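// Walking through a hypothetical call with mAudioSamplesPerChunk = 480,
// mBufferedSamples = 400 and aDuration = 1100: FillSamplesBuffer completes
// the pending chunk with 80 samples, SplitSamplesBuffer carves samples
// 80-1039 into two more full chunks, and the trailing 60 samples are
// buffered for the next call; the three complete chunks travel to the
// main thread as a single EVENT_AUDIO_DATA.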

const char*
SpeechRecognition::GetName(FSMState aId)
{
  static const char* names[] = {
    "STATE_IDLE",
    "STATE_STARTING",
    "STATE_ESTIMATING",
    "STATE_WAITING_FOR_SPEECH",
    "STATE_RECOGNIZING",
    "STATE_WAITING_FOR_RESULT",
  };

  MOZ_ASSERT(aId < STATE_COUNT);
  MOZ_ASSERT(ArrayLength(names) == STATE_COUNT);
  return names[aId];
}

const char*
SpeechRecognition::GetName(SpeechEvent* aEvent)
{
  static const char* names[] = {
    "EVENT_START",
    "EVENT_STOP",
    "EVENT_ABORT",
    "EVENT_AUDIO_DATA",
    "EVENT_AUDIO_ERROR",
    "EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT",
    "EVENT_RECOGNITIONSERVICE_FINAL_RESULT",
    "EVENT_RECOGNITIONSERVICE_ERROR"
  };

  MOZ_ASSERT(aEvent->mType < EVENT_COUNT);
  MOZ_ASSERT(ArrayLength(names) == EVENT_COUNT);
  return names[aEvent->mType];
}

SpeechEvent::~SpeechEvent()
{
  delete mAudioSegment;
}

NS_IMETHODIMP
SpeechEvent::Run()
{
  mRecognition->ProcessEvent(this);
  return NS_OK;
}

NS_IMPL_ISUPPORTS(SpeechRecognition::GetUserMediaSuccessCallback,
                  nsIDOMGetUserMediaSuccessCallback)

NS_IMETHODIMP
SpeechRecognition::GetUserMediaSuccessCallback::OnSuccess(nsISupports* aStream)
{
  RefPtr<DOMMediaStream> stream = do_QueryObject(aStream);
  if (!stream) {
    return NS_ERROR_NO_INTERFACE;
  }
  mRecognition->StartRecording(stream);
  return NS_OK;
}

NS_IMPL_ISUPPORTS(SpeechRecognition::GetUserMediaErrorCallback,
                  nsIDOMGetUserMediaErrorCallback)

NS_IMETHODIMP
SpeechRecognition::GetUserMediaErrorCallback::OnError(nsISupports* aError)
{
  RefPtr<MediaStreamError> error = do_QueryObject(aError);
  if (!error) {
    return NS_OK;
  }
  SpeechRecognitionErrorCode errorCode;

  nsAutoString name;
  error->GetName(name);
  if (name.EqualsLiteral("PERMISSION_DENIED")) {
    errorCode = SpeechRecognitionErrorCode::Not_allowed;
  } else {
    errorCode = SpeechRecognitionErrorCode::Audio_capture;
  }

  nsAutoString message;
  error->GetMessage(message);
  mRecognition->DispatchError(SpeechRecognition::EVENT_AUDIO_ERROR, errorCode,
                              message);
  return NS_OK;
}

} // namespace dom
} // namespace mozilla