/src/mozilla-central/dom/media/webspeech/recognition/SpeechRecognition.h
/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* vim:set ts=2 sw=2 sts=2 et cindent: */
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#ifndef mozilla_dom_SpeechRecognition_h
#define mozilla_dom_SpeechRecognition_h

#include "mozilla/Attributes.h"
#include "mozilla/DOMEventTargetHelper.h"
#include "nsCOMPtr.h"
#include "nsString.h"
#include "nsWrapperCache.h"
#include "nsTArray.h"
#include "js/TypeDecls.h"

#include "DOMMediaStream.h"
#include "nsIDOMNavigatorUserMedia.h"
#include "nsITimer.h"
#include "MediaStreamGraph.h"
#include "AudioSegment.h"
#include "mozilla/WeakPtr.h"

#include "SpeechGrammarList.h"
#include "SpeechRecognitionResultList.h"
#include "SpeechStreamListener.h"
#include "nsISpeechRecognitionService.h"
#include "endpointer.h"

#include "mozilla/dom/BindingDeclarations.h"
#include "mozilla/dom/SpeechRecognitionError.h"

namespace mozilla {

class DOMMediaStream;

namespace dom {

#define SPEECH_RECOGNITION_TEST_EVENT_REQUEST_TOPIC \
  "SpeechRecognitionTest:RequestEvent"
#define SPEECH_RECOGNITION_TEST_END_TOPIC "SpeechRecognitionTest:End"

class GlobalObject;
class SpeechEvent;

LogModule* GetSpeechRecognitionLog();
#define SR_LOG(...) \
  MOZ_LOG(GetSpeechRecognitionLog(), mozilla::LogLevel::Debug, (__VA_ARGS__))

class SpeechRecognition final : public DOMEventTargetHelper,
                                public nsIObserver,
                                public SupportsWeakPtr<SpeechRecognition>
{
public:
  MOZ_DECLARE_WEAKREFERENCE_TYPENAME(SpeechRecognition)
  explicit SpeechRecognition(nsPIDOMWindowInner* aOwnerWindow);

  NS_DECL_ISUPPORTS_INHERITED
  NS_DECL_CYCLE_COLLECTION_CLASS_INHERITED(SpeechRecognition,
                                           DOMEventTargetHelper)

  NS_DECL_NSIOBSERVER

  nsISupports* GetParentObject() const;

  JSObject* WrapObject(JSContext* aCx,
                       JS::Handle<JSObject*> aGivenProto) override;

  static bool IsAuthorized(JSContext* aCx, JSObject* aGlobal);

  static already_AddRefed<SpeechRecognition>
  Constructor(const GlobalObject& aGlobal, ErrorResult& aRv);

  already_AddRefed<SpeechGrammarList> Grammars() const;

  void SetGrammars(mozilla::dom::SpeechGrammarList& aArg);

  void GetLang(nsString& aRetVal) const;

  void SetLang(const nsAString& aArg);

  bool GetContinuous(ErrorResult& aRv) const;

  void SetContinuous(bool aArg, ErrorResult& aRv);

  bool InterimResults() const;

  void SetInterimResults(bool aArg);

  uint32_t MaxAlternatives() const;

  void SetMaxAlternatives(uint32_t aArg);

  void GetServiceURI(nsString& aRetVal, ErrorResult& aRv) const;

  void SetServiceURI(const nsAString& aArg, ErrorResult& aRv);

  void Start(const Optional<NonNull<DOMMediaStream>>& aStream,
             CallerType aCallerType, ErrorResult& aRv);

  void Stop();

  void Abort();

  IMPL_EVENT_HANDLER(audiostart)
  IMPL_EVENT_HANDLER(soundstart)
  IMPL_EVENT_HANDLER(speechstart)
  IMPL_EVENT_HANDLER(speechend)
  IMPL_EVENT_HANDLER(soundend)
  IMPL_EVENT_HANDLER(audioend)
  IMPL_EVENT_HANDLER(result)
  IMPL_EVENT_HANDLER(nomatch)
  IMPL_EVENT_HANDLER(error)
  IMPL_EVENT_HANDLER(start)
  IMPL_EVENT_HANDLER(end)
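
  // Per the Web Speech API spec, a successful session typically fires these
  // events in order: start, audiostart, soundstart, speechstart, speechend,
  // soundend, audioend, result (or nomatch), end. An error event may take the
  // place of result and is likewise followed by end.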

  enum EventType {
    EVENT_START,
    EVENT_STOP,
    EVENT_ABORT,
    EVENT_AUDIO_DATA,
    EVENT_AUDIO_ERROR,
    EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT,
    EVENT_RECOGNITIONSERVICE_FINAL_RESULT,
    EVENT_RECOGNITIONSERVICE_ERROR,
    EVENT_COUNT
  };
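
  // These internal events are carried by SpeechEvent runnables (declared
  // below) and consumed by ProcessEvent()/Transition(), which drive the
  // private FSM.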

  void DispatchError(EventType aErrorType,
                     SpeechRecognitionErrorCode aErrorCode,
                     const nsAString& aMessage);
  uint32_t FillSamplesBuffer(const int16_t* aSamples, uint32_t aSampleCount);
  uint32_t SplitSamplesBuffer(const int16_t* aSamplesBuffer,
                              uint32_t aSampleCount,
                              nsTArray<RefPtr<SharedBuffer>>& aResult);
  AudioSegment* CreateAudioSegment(nsTArray<RefPtr<SharedBuffer>>& aChunks);
  void FeedAudioData(already_AddRefed<SharedBuffer> aSamples,
                     uint32_t aDuration,
                     MediaStreamListener* aProvider, TrackRate aTrackRate);
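
  // Rough sketch (illustrative only, not the normative implementation) of
  // how incoming 16-bit samples are chunked into mAudioSamplesPerChunk-sized
  // SharedBuffers before reaching mEndpointer, assuming FillSamplesBuffer
  // returns the number of samples it consumed:
  //
  //   nsTArray<RefPtr<SharedBuffer>> chunks;
  //   // Top up the partially filled mAudioSamplesBuffer first...
  //   uint32_t consumed = FillSamplesBuffer(aSamples, aSampleCount);
  //   // ...then split whatever remains into whole chunks.
  //   SplitSamplesBuffer(aSamples + consumed, aSampleCount - consumed,
  //                      chunks);
  //   // The chunks become an AudioSegment that an EVENT_AUDIO_DATA event
  //   // carries to ProcessAudioSegment(), which feeds mEndpointer.
  //   AudioSegment* segment = CreateAudioSegment(chunks);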

  friend class SpeechEvent;
private:
  virtual ~SpeechRecognition() {}

  enum FSMState {
    STATE_IDLE,
    STATE_STARTING,
    STATE_ESTIMATING,
    STATE_WAITING_FOR_SPEECH,
    STATE_RECOGNIZING,
    STATE_WAITING_FOR_RESULT,
    STATE_COUNT
  };
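
  // A plausible happy-path walk through these states, inferred from the state
  // names and the event types above (a sketch, not a normative transition
  // table):
  //
  //   STATE_IDLE               -> STATE_STARTING           (EVENT_START)
  //   STATE_STARTING           -> STATE_ESTIMATING         (audio capture
  //                                starts; mEndpointer estimates noise)
  //   STATE_ESTIMATING         -> STATE_WAITING_FOR_SPEECH
  //   STATE_WAITING_FOR_SPEECH -> STATE_RECOGNIZING        (speech detected)
  //   STATE_RECOGNIZING        -> STATE_WAITING_FOR_RESULT (EVENT_STOP or
  //                                speech end detected)
  //   STATE_WAITING_FOR_RESULT -> STATE_IDLE
  //                                (EVENT_RECOGNITIONSERVICE_FINAL_RESULT)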

  void SetState(FSMState state);
  bool StateBetween(FSMState begin, FSMState end);

  bool SetRecognitionService(ErrorResult& aRv);
  bool ValidateAndSetGrammarList(ErrorResult& aRv);

  class GetUserMediaSuccessCallback : public nsIDOMGetUserMediaSuccessCallback
  {
  public:
    NS_DECL_ISUPPORTS
    NS_DECL_NSIDOMGETUSERMEDIASUCCESSCALLBACK

    explicit GetUserMediaSuccessCallback(SpeechRecognition* aRecognition)
      : mRecognition(aRecognition)
    {}

  private:
    virtual ~GetUserMediaSuccessCallback() {}

    RefPtr<SpeechRecognition> mRecognition;
  };

  class GetUserMediaErrorCallback : public nsIDOMGetUserMediaErrorCallback
  {
  public:
    NS_DECL_ISUPPORTS
    NS_DECL_NSIDOMGETUSERMEDIAERRORCALLBACK

    explicit GetUserMediaErrorCallback(SpeechRecognition* aRecognition)
      : mRecognition(aRecognition)
    {}

  private:
    virtual ~GetUserMediaErrorCallback() {}

    RefPtr<SpeechRecognition> mRecognition;
  };

  NS_IMETHOD StartRecording(DOMMediaStream* aDOMStream);
  NS_IMETHOD StopRecording();

  uint32_t ProcessAudioSegment(AudioSegment* aSegment, TrackRate aTrackRate);
  void NotifyError(SpeechEvent* aEvent);

  void ProcessEvent(SpeechEvent* aEvent);
  void Transition(SpeechEvent* aEvent);

  void Reset();
  void ResetAndEnd();
  void WaitForAudioData(SpeechEvent* aEvent);
  void StartedAudioCapture(SpeechEvent* aEvent);
  void StopRecordingAndRecognize(SpeechEvent* aEvent);
  void WaitForEstimation(SpeechEvent* aEvent);
  void DetectSpeech(SpeechEvent* aEvent);
  void WaitForSpeechEnd(SpeechEvent* aEvent);
  void NotifyFinalResult(SpeechEvent* aEvent);
  void DoNothing(SpeechEvent* aEvent);
  void AbortSilently(SpeechEvent* aEvent);
  void AbortError(SpeechEvent* aEvent);

  RefPtr<DOMMediaStream> mDOMStream;
  RefPtr<SpeechStreamListener> mSpeechListener;
  nsCOMPtr<nsISpeechRecognitionService> mRecognitionService;

  FSMState mCurrentState;

  Endpointer mEndpointer;
  uint32_t mEstimationSamples;

  uint32_t mAudioSamplesPerChunk;

  // Buffer that holds one chunk of mAudioSamplesPerChunk samples
  // before it is fed to mEndpointer.
  RefPtr<SharedBuffer> mAudioSamplesBuffer;
  uint32_t mBufferedSamples;

  nsCOMPtr<nsITimer> mSpeechDetectionTimer;
  bool mAborted;

  nsString mLang;

  RefPtr<SpeechGrammarList> mSpeechGrammarList;

  // The Web Speech API spec (http://bit.ly/1gIl7DC) states:
  //
  //   1. The default value MUST be false.
  //   2. If true, interim results SHOULD be returned.
  //   3. If false, interim results MUST NOT be returned.
  //
  // Pocketsphinx does not produce interim results, so defaulting
  // mInterimResults to false and then ignoring any value it is later
  // set to is a conforming implementation.
  bool mInterimResults;

  // The Web Speech API spec (http://bit.ly/1JAiqeo) states:
  //
  //   1. The default value is 1.
  //   2. Subsequent values give the "maximum number of
  //      SpeechRecognitionAlternatives per result".
  //
  // Pocketsphinx can return at most one SpeechRecognitionAlternative per
  // SpeechRecognitionResult. Defaulting mMaxAlternatives to 1, ignoring any
  // nonzero value it is later set to (e.g. 5 still yields one alternative
  // per result), and returning no alternatives when it is set to 0 is
  // therefore a conforming implementation.
  uint32_t mMaxAlternatives;

  void ProcessTestEventRequest(nsISupports* aSubject,
                               const nsAString& aEventName);

  const char* GetName(FSMState aId);
  const char* GetName(SpeechEvent* aId);
};

class SpeechEvent : public Runnable
{
public:
  SpeechEvent(SpeechRecognition* aRecognition,
              SpeechRecognition::EventType aType)
    : Runnable("dom::SpeechEvent")
    , mAudioSegment(nullptr)
    , mRecognitionResultList(nullptr)
    , mError(nullptr)
    , mRecognition(aRecognition)
    , mType(aType)
    , mTrackRate(0)
  {
  }

  ~SpeechEvent();

  NS_IMETHOD Run() override;
  AudioSegment* mAudioSegment;
  // TODO: replace this with a passed-in session object that also carries
  // the result index.
  RefPtr<SpeechRecognitionResultList> mRecognitionResultList;
  RefPtr<SpeechRecognitionError> mError;

  friend class SpeechRecognition;
private:
  SpeechRecognition* mRecognition;

  // For AUDIO_DATA events, keep a reference to the provider of the data
  // (i.e. the SpeechStreamListener) to ensure it is kept alive (and keeps
  // SpeechRecognition alive) until this event gets processed.
  RefPtr<MediaStreamListener> mProvider;
  SpeechRecognition::EventType mType;
  TrackRate mTrackRate;
};

} // namespace dom

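// SpeechRecognition inherits nsISupports along more than one path (via
// DOMEventTargetHelper and nsIObserver), so this overload routes the
// conversion through DOMEventTargetHelper to keep the cast unambiguous.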
inline nsISupports*
ToSupports(dom::SpeechRecognition* aRec)
{
  return ToSupports(static_cast<DOMEventTargetHelper*>(aRec));
}

} // namespace mozilla

#endif // mozilla_dom_SpeechRecognition_h