Coverage Report

Created: 2018-09-25 14:53

/src/mozilla-central/dom/media/webspeech/recognition/SpeechRecognition.h
Line
Count
Source (jump to first uncovered line)
1
/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2
/* vim:set ts=2 sw=2 sts=2 et cindent: */
3
/* This Source Code Form is subject to the terms of the Mozilla Public
4
 * License, v. 2.0. If a copy of the MPL was not distributed with this
5
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
6
7
#ifndef mozilla_dom_SpeechRecognition_h
8
#define mozilla_dom_SpeechRecognition_h
9
10
#include "mozilla/Attributes.h"
11
#include "mozilla/DOMEventTargetHelper.h"
12
#include "nsCOMPtr.h"
13
#include "nsString.h"
14
#include "nsWrapperCache.h"
15
#include "nsTArray.h"
16
#include "js/TypeDecls.h"
17
18
#include "DOMMediaStream.h"
19
#include "nsIDOMNavigatorUserMedia.h"
20
#include "nsITimer.h"
21
#include "MediaStreamGraph.h"
22
#include "AudioSegment.h"
23
#include "mozilla/WeakPtr.h"
24
25
#include "SpeechGrammarList.h"
26
#include "SpeechRecognitionResultList.h"
27
#include "SpeechStreamListener.h"
28
#include "nsISpeechRecognitionService.h"
29
#include "endpointer.h"
30
31
#include "mozilla/dom/BindingDeclarations.h"
32
#include "mozilla/dom/SpeechRecognitionError.h"
33
34
namespace mozilla {
35
36
class DOMMediaStream;
37
38
namespace dom {
39
40
0
#define SPEECH_RECOGNITION_TEST_EVENT_REQUEST_TOPIC "SpeechRecognitionTest:RequestEvent"
41
0
#define SPEECH_RECOGNITION_TEST_END_TOPIC "SpeechRecognitionTest:End"
42
43
class GlobalObject;
44
class SpeechEvent;
45
46
LogModule* GetSpeechRecognitionLog();
47
#define SR_LOG(...) MOZ_LOG(GetSpeechRecognitionLog(), mozilla::LogLevel::Debug, (__VA_ARGS__))
48
49
class SpeechRecognition final : public DOMEventTargetHelper,
50
                                public nsIObserver,
51
                                public SupportsWeakPtr<SpeechRecognition>
52
{
53
public:
54
  MOZ_DECLARE_WEAKREFERENCE_TYPENAME(SpeechRecognition)
55
  explicit SpeechRecognition(nsPIDOMWindowInner* aOwnerWindow);
56
57
  NS_DECL_ISUPPORTS_INHERITED
58
  NS_DECL_CYCLE_COLLECTION_CLASS_INHERITED(SpeechRecognition, DOMEventTargetHelper)
59
60
  NS_DECL_NSIOBSERVER
61
62
  nsISupports* GetParentObject() const;
63
64
  JSObject* WrapObject(JSContext* aCx, JS::Handle<JSObject*> aGivenProto) override;
65
66
  static bool IsAuthorized(JSContext* aCx, JSObject* aGlobal);
67
68
  static already_AddRefed<SpeechRecognition>
69
  Constructor(const GlobalObject& aGlobal, ErrorResult& aRv);
70
71
  already_AddRefed<SpeechGrammarList> Grammars() const;
72
73
  void SetGrammars(mozilla::dom::SpeechGrammarList& aArg);
74
75
  void GetLang(nsString& aRetVal) const;
76
77
  void SetLang(const nsAString& aArg);
78
79
  bool GetContinuous(ErrorResult& aRv) const;
80
81
  void SetContinuous(bool aArg, ErrorResult& aRv);
82
83
  bool InterimResults() const;
84
85
  void SetInterimResults(bool aArg);
86
87
  uint32_t MaxAlternatives() const;
88
89
  void SetMaxAlternatives(uint32_t aArg);
90
91
  void GetServiceURI(nsString& aRetVal, ErrorResult& aRv) const;
92
93
  void SetServiceURI(const nsAString& aArg, ErrorResult& aRv);
94
95
  void Start(const Optional<NonNull<DOMMediaStream>>& aStream,
96
             CallerType aCallerType, ErrorResult& aRv);
97
98
  void Stop();
99
100
  void Abort();
101
102
  IMPL_EVENT_HANDLER(audiostart)
103
  IMPL_EVENT_HANDLER(soundstart)
104
  IMPL_EVENT_HANDLER(speechstart)
105
  IMPL_EVENT_HANDLER(speechend)
106
  IMPL_EVENT_HANDLER(soundend)
107
  IMPL_EVENT_HANDLER(audioend)
108
  IMPL_EVENT_HANDLER(result)
109
  IMPL_EVENT_HANDLER(nomatch)
110
  IMPL_EVENT_HANDLER(error)
111
  IMPL_EVENT_HANDLER(start)
112
  IMPL_EVENT_HANDLER(end)
113
114
  enum EventType {
115
    EVENT_START,
116
    EVENT_STOP,
117
    EVENT_ABORT,
118
    EVENT_AUDIO_DATA,
119
    EVENT_AUDIO_ERROR,
120
    EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT,
121
    EVENT_RECOGNITIONSERVICE_FINAL_RESULT,
122
    EVENT_RECOGNITIONSERVICE_ERROR,
123
    EVENT_COUNT
124
  };
125
126
  void DispatchError(EventType aErrorType, SpeechRecognitionErrorCode aErrorCode, const nsAString& aMessage);
127
  uint32_t FillSamplesBuffer(const int16_t* aSamples, uint32_t aSampleCount);
128
  uint32_t SplitSamplesBuffer(const int16_t* aSamplesBuffer, uint32_t aSampleCount, nsTArray<RefPtr<SharedBuffer>>& aResult);
129
  AudioSegment* CreateAudioSegment(nsTArray<RefPtr<SharedBuffer>>& aChunks);
130
  void FeedAudioData(already_AddRefed<SharedBuffer> aSamples, uint32_t aDuration, MediaStreamListener* aProvider, TrackRate aTrackRate);
131
132
  friend class SpeechEvent;
133
private:
134
0
  virtual ~SpeechRecognition() {};
135
136
  enum FSMState {
137
    STATE_IDLE,
138
    STATE_STARTING,
139
    STATE_ESTIMATING,
140
    STATE_WAITING_FOR_SPEECH,
141
    STATE_RECOGNIZING,
142
    STATE_WAITING_FOR_RESULT,
143
    STATE_COUNT
144
  };
145
146
  void SetState(FSMState state);
147
  bool StateBetween(FSMState begin, FSMState end);
148
149
  bool SetRecognitionService(ErrorResult& aRv);
150
  bool ValidateAndSetGrammarList(ErrorResult& aRv);
151
152
  class GetUserMediaSuccessCallback : public nsIDOMGetUserMediaSuccessCallback
153
  {
154
  public:
155
    NS_DECL_ISUPPORTS
156
    NS_DECL_NSIDOMGETUSERMEDIASUCCESSCALLBACK
157
158
    explicit GetUserMediaSuccessCallback(SpeechRecognition* aRecognition)
159
      : mRecognition(aRecognition)
160
0
    {}
161
162
  private:
163
0
    virtual ~GetUserMediaSuccessCallback() {}
164
165
    RefPtr<SpeechRecognition> mRecognition;
166
  };
167
168
  class GetUserMediaErrorCallback : public nsIDOMGetUserMediaErrorCallback
169
  {
170
  public:
171
    NS_DECL_ISUPPORTS
172
    NS_DECL_NSIDOMGETUSERMEDIAERRORCALLBACK
173
174
    explicit GetUserMediaErrorCallback(SpeechRecognition* aRecognition)
175
      : mRecognition(aRecognition)
176
0
    {}
177
178
  private:
179
0
    virtual ~GetUserMediaErrorCallback() {}
180
181
    RefPtr<SpeechRecognition> mRecognition;
182
  };
183
184
  NS_IMETHOD StartRecording(DOMMediaStream* aDOMStream);
185
  NS_IMETHOD StopRecording();
186
187
  uint32_t ProcessAudioSegment(AudioSegment* aSegment, TrackRate aTrackRate);
188
  void NotifyError(SpeechEvent* aEvent);
189
190
  void ProcessEvent(SpeechEvent* aEvent);
191
  void Transition(SpeechEvent* aEvent);
192
193
  void Reset();
194
  void ResetAndEnd();
195
  void WaitForAudioData(SpeechEvent* aEvent);
196
  void StartedAudioCapture(SpeechEvent* aEvent);
197
  void StopRecordingAndRecognize(SpeechEvent* aEvent);
198
  void WaitForEstimation(SpeechEvent* aEvent);
199
  void DetectSpeech(SpeechEvent* aEvent);
200
  void WaitForSpeechEnd(SpeechEvent* aEvent);
201
  void NotifyFinalResult(SpeechEvent* aEvent);
202
  void DoNothing(SpeechEvent* aEvent);
203
  void AbortSilently(SpeechEvent* aEvent);
204
  void AbortError(SpeechEvent* aEvent);
205
206
  RefPtr<DOMMediaStream> mDOMStream;
207
  RefPtr<SpeechStreamListener> mSpeechListener;
208
  nsCOMPtr<nsISpeechRecognitionService> mRecognitionService;
209
210
  FSMState mCurrentState;
211
212
  Endpointer mEndpointer;
213
  uint32_t mEstimationSamples;
214
215
  uint32_t mAudioSamplesPerChunk;
216
217
  // buffer holds one chunk of mAudioSamplesPerChunk
218
  // samples before feeding it to mEndpointer
219
  RefPtr<SharedBuffer> mAudioSamplesBuffer;
220
  uint32_t mBufferedSamples;
221
222
  nsCOMPtr<nsITimer> mSpeechDetectionTimer;
223
  bool mAborted;
224
225
  nsString mLang;
226
227
  RefPtr<SpeechGrammarList> mSpeechGrammarList;
228
229
  // WebSpeechAPI (http://bit.ly/1gIl7DC) states:
230
  //
231
  // 1. Default value MUST be false
232
  // 2. If true, interim results SHOULD be returned
233
  // 3. If false, interim results MUST NOT be returned
234
  //
235
  // Pocketsphinx does not return interim results; so, defaulting
236
  // mInterimResults to false, then ignoring its subsequent value
237
  // is a conforming implementation.
238
  bool mInterimResults;
239
240
  // WebSpeechAPI (http://bit.ly/1JAiqeo) states:
241
  //
242
  // 1. Default value is 1
243
  // 2. Subsequent value is the "maximum number of SpeechRecognitionAlternatives per result"
244
  //
245
  // Pocketsphinx can only return at maximum a single SpeechRecognitionAlternative
246
  // per SpeechRecognitionResult. So defaulting mMaxAlternatives to 1, for all non
247
  // zero values ignoring mMaxAlternatives while for a 0 value returning no
248
  // SpeechRecognitionAlternative per result is a conforming implementation.
249
  uint32_t mMaxAlternatives;
250
251
  void ProcessTestEventRequest(nsISupports* aSubject, const nsAString& aEventName);
252
253
  const char* GetName(FSMState aId);
254
  const char* GetName(SpeechEvent* aId);
255
};
256
257
class SpeechEvent : public Runnable
258
{
259
public:
260
  SpeechEvent(SpeechRecognition* aRecognition,
261
              SpeechRecognition::EventType aType)
262
    : Runnable("dom::SpeechEvent")
263
    , mAudioSegment(0)
264
    , mRecognitionResultList(nullptr)
265
    , mError(nullptr)
266
    , mRecognition(aRecognition)
267
    , mType(aType)
268
    , mTrackRate(0)
269
0
  {
270
0
  }
271
272
  ~SpeechEvent();
273
274
  NS_IMETHOD Run() override;
275
  AudioSegment* mAudioSegment;
276
  RefPtr<SpeechRecognitionResultList> mRecognitionResultList; // TODO: make this a session being passed which also has index and stuff
277
  RefPtr<SpeechRecognitionError> mError;
278
279
  friend class SpeechRecognition;
280
private:
281
  SpeechRecognition* mRecognition;
282
283
  // for AUDIO_DATA events, keep a reference to the provider
284
  // of the data (i.e., the SpeechStreamListener) to ensure it
285
  // is kept alive (and keeps SpeechRecognition alive) until this
286
  // event gets processed.
287
  RefPtr<MediaStreamListener> mProvider;
288
  SpeechRecognition::EventType mType;
289
  TrackRate mTrackRate;
290
};
291
292
} // namespace dom
293
294
inline nsISupports*
295
ToSupports(dom::SpeechRecognition* aRec)
296
0
{
297
0
  return ToSupports(static_cast<DOMEventTargetHelper*>(aRec));
298
0
}
299
300
} // namespace mozilla
301
302
#endif