Coverage Report

Created: 2018-09-25 14:53

/src/mozilla-central/parser/html/nsHtml5StreamParser.h
Line
Count
Source (jump to first uncovered line)
1
/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2
/* This Source Code Form is subject to the terms of the Mozilla Public
3
 * License, v. 2.0. If a copy of the MPL was not distributed with this
4
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
5
6
#ifndef nsHtml5StreamParser_h
7
#define nsHtml5StreamParser_h
8
9
#include "nsAutoPtr.h"
10
#include "nsCOMPtr.h"
11
#include "nsICharsetDetectionObserver.h"
12
#include "nsHtml5MetaScanner.h"
13
#include "mozilla/Encoding.h"
14
#include "nsHtml5TreeOpExecutor.h"
15
#include "nsHtml5OwningUTF16Buffer.h"
16
#include "nsIInputStream.h"
17
#include "mozilla/Mutex.h"
18
#include "mozilla/UniquePtr.h"
19
#include "nsHtml5AtomTable.h"
20
#include "nsHtml5Speculation.h"
21
#include "nsISerialEventTarget.h"
22
#include "nsITimer.h"
23
#include "nsICharsetDetector.h"
24
#include "mozilla/dom/DocGroup.h"
25
26
class nsHtml5Parser;
27
28
0
#define NS_HTML5_STREAM_PARSER_READ_BUFFER_SIZE 1024
29
0
#define NS_HTML5_STREAM_PARSER_SNIFFING_BUFFER_SIZE 1024
30
31
enum eParserMode
32
{
33
  /**
34
   * Parse a document normally as HTML.
35
   */
36
  NORMAL,
37
38
  /**
39
   * View document as HTML source.
40
   */
41
  VIEW_SOURCE_HTML,
42
43
  /**
44
   * View document as XML source
45
   */
46
  VIEW_SOURCE_XML,
47
48
  /**
49
   * View document as plain text source
50
   */
51
  VIEW_SOURCE_PLAIN,
52
53
  /**
54
   * View document as plain text
55
   */
56
  PLAIN_TEXT,
57
58
  /**
59
   * Load as data (XHR)
60
   */
61
  LOAD_AS_DATA
62
};
63
64
enum eBomState
65
{
66
  /**
67
   * BOM sniffing hasn't started.
68
   */
69
  BOM_SNIFFING_NOT_STARTED = 0,
70
71
  /**
72
   * BOM sniffing is ongoing, and the first byte of an UTF-16LE BOM has been
73
   * seen.
74
   */
75
  SEEN_UTF_16_LE_FIRST_BYTE = 1,
76
77
  /**
78
   * BOM sniffing is ongoing, and the first byte of an UTF-16BE BOM has been
79
   * seen.
80
   */
81
  SEEN_UTF_16_BE_FIRST_BYTE = 2,
82
83
  /**
84
   * BOM sniffing is ongoing, and the first byte of an UTF-8 BOM has been
85
   * seen.
86
   */
87
  SEEN_UTF_8_FIRST_BYTE = 3,
88
89
  /**
90
   * BOM sniffing is ongoing, and the first and second bytes of an UTF-8 BOM
91
   * have been seen.
92
   */
93
  SEEN_UTF_8_SECOND_BYTE = 4,
94
95
  /**
96
   * BOM sniffing was started but is now over for whatever reason.
97
   */
98
  BOM_SNIFFING_OVER = 5
99
};
100
101
enum eHtml5StreamState
102
{
103
  STREAM_NOT_STARTED = 0,
104
  STREAM_BEING_READ = 1,
105
  STREAM_ENDED = 2
106
};
107
108
class nsHtml5StreamParser final : public nsICharsetDetectionObserver
109
{
110
  template<typename T>
111
  using NotNull = mozilla::NotNull<T>;
112
  using Encoding = mozilla::Encoding;
113
114
  friend class nsHtml5RequestStopper;
115
  friend class nsHtml5DataAvailable;
116
  friend class nsHtml5StreamParserContinuation;
117
  friend class nsHtml5TimerKungFu;
118
  friend class nsHtml5StreamParserPtr;
119
120
public:
121
  NS_DECL_CYCLE_COLLECTING_ISUPPORTS
122
  NS_DECL_CYCLE_COLLECTION_CLASS_AMBIGUOUS(nsHtml5StreamParser,
123
                                           nsICharsetDetectionObserver)
124
125
  nsHtml5StreamParser(nsHtml5TreeOpExecutor* aExecutor,
126
                      nsHtml5Parser* aOwner,
127
                      eParserMode aMode);
128
129
  // Methods that nsHtml5StreamListener calls
130
  nsresult CheckListenerChain();
131
132
  nsresult OnStartRequest(nsIRequest* aRequest, nsISupports* aContext);
133
134
  nsresult OnDataAvailable(nsIRequest* aRequest,
135
                           nsISupports* aContext,
136
                           nsIInputStream* aInStream,
137
                           uint64_t aSourceOffset,
138
                           uint32_t aLength);
139
140
  nsresult OnStopRequest(nsIRequest* aRequest,
141
                         nsISupports* aContext,
142
                         nsresult status);
143
144
  // nsICharsetDetectionObserver
145
  /**
146
   * Chardet calls this to report the detection result
147
   */
148
  NS_IMETHOD Notify(const char* aCharset, nsDetectionConfident aConf) override;
149
150
  // EncodingDeclarationHandler
151
  // https://hg.mozilla.org/projects/htmlparser/file/tip/src/nu/validator/htmlparser/common/EncodingDeclarationHandler.java
152
  /**
153
   * Tree builder uses this to report a late <meta charset>
154
   */
155
  bool internalEncodingDeclaration(nsHtml5String aEncoding);
156
157
  // Not from an external interface
158
159
  /**
160
   *  Call this method once you've created a parser, and want to instruct it
161
   *  about what charset to load
162
   *
163
   *  @param   aEncoding the charset of a document
164
   *  @param   aSource the source of the charset
165
   */
166
  inline void SetDocumentCharset(NotNull<const Encoding*> aEncoding,
167
                                 int32_t aSource)
168
0
  {
169
0
    MOZ_ASSERT(mStreamState == STREAM_NOT_STARTED,
170
0
               "SetDocumentCharset called too late.");
171
0
    NS_ASSERTION(NS_IsMainThread(), "Wrong thread!");
172
0
    mEncoding = aEncoding;
173
0
    mCharsetSource = aSource;
174
0
  }
175
176
  inline void SetObserver(nsIRequestObserver* aObserver)
177
0
  {
178
0
    NS_ASSERTION(NS_IsMainThread(), "Wrong thread!");
179
0
    mObserver = aObserver;
180
0
  }
181
182
  nsresult GetChannel(nsIChannel** aChannel);
183
184
  /**
185
   * The owner parser must call this after script execution
186
   * when no scripts are executing and the document.written
187
   * buffer has been exhausted.
188
   */
189
  void ContinueAfterScripts(nsHtml5Tokenizer* aTokenizer,
190
                            nsHtml5TreeBuilder* aTreeBuilder,
191
                            bool aLastWasCR);
192
193
  /**
194
   * Continues the stream parser if the charset switch failed.
195
   */
196
  void ContinueAfterFailedCharsetSwitch();
197
198
  void Terminate()
199
0
  {
200
0
    mozilla::MutexAutoLock autoLock(mTerminatedMutex);
201
0
    mTerminated = true;
202
0
  }
203
204
  void DropTimer();
205
206
  /**
207
   * Sets mEncoding and mCharsetSource appropriately for the XML View Source
208
   * case if aEncoding names a supported rough ASCII superset and sets
209
   * the mEncoding and mCharsetSource to the UTF-8 default otherwise.
210
   */
211
  void SetEncodingFromExpat(const char16_t* aEncoding);
212
213
  /**
214
   * Sets the URL for View Source title in case this parser ends up being
215
   * used for View Source. If aURL is a view-source: URL, takes the inner
216
   * URL. data: URLs are shown with an ellipsis instead of the actual data.
217
   */
218
  void SetViewSourceTitle(nsIURI* aURL);
219
220
private:
221
  virtual ~nsHtml5StreamParser();
222
223
#ifdef DEBUG
224
  bool IsParserThread() { return mEventTarget->IsOnCurrentThread(); }
225
#endif
226
227
  void MarkAsBroken(nsresult aRv);
228
229
  /**
230
   * Marks the stream parser as interrupted. If you ever add calls to this
231
   * method, be sure to review Uninterrupt usage very, very carefully to
232
   * avoid having a previous in-flight runnable cancel your Interrupt()
233
   * call on the other thread too soon.
234
   */
235
  void Interrupt()
236
0
  {
237
0
    mozilla::MutexAutoLock autoLock(mTerminatedMutex);
238
0
    mInterrupted = true;
239
0
  }
240
241
  void Uninterrupt()
242
0
  {
243
0
    NS_ASSERTION(IsParserThread(), "Wrong thread!");
244
0
    mTokenizerMutex.AssertCurrentThreadOwns();
245
0
    // Not acquiring mTerminatedMutex because mTokenizerMutex is already
246
0
    // held at this point and is already stronger.
247
0
    mInterrupted = false;
248
0
  }
249
250
  /**
251
   * Flushes the tree ops from the tree builder and disarms the flush
252
   * timer.
253
   */
254
  void FlushTreeOpsAndDisarmTimer();
255
256
  void ParseAvailableData();
257
258
  void DoStopRequest();
259
260
  void DoDataAvailable(const uint8_t* aBuffer, uint32_t aLength);
261
262
  static nsresult CopySegmentsToParser(nsIInputStream* aInStream,
263
                                       void* aClosure,
264
                                       const char* aFromSegment,
265
                                       uint32_t aToOffset,
266
                                       uint32_t aCount,
267
                                       uint32_t* aWriteCount);
268
269
  bool IsTerminatedOrInterrupted()
270
0
  {
271
0
    mozilla::MutexAutoLock autoLock(mTerminatedMutex);
272
0
    return mTerminated || mInterrupted;
273
0
  }
274
275
  bool IsTerminated()
276
0
  {
277
0
    mozilla::MutexAutoLock autoLock(mTerminatedMutex);
278
0
    return mTerminated;
279
0
  }
280
281
  /**
282
   * True when there is a Unicode decoder already
283
   */
284
0
  inline bool HasDecoder() { return !!mUnicodeDecoder; }
285
286
  /**
287
   * Push bytes from network when there is no Unicode decoder yet
288
   */
289
  nsresult SniffStreamBytes(const uint8_t* aFromSegment,
290
                            uint32_t aCount,
291
                            uint32_t* aWriteCount);
292
293
  /**
294
   * Push bytes from network when there is a Unicode decoder already
295
   */
296
  nsresult WriteStreamBytes(const uint8_t* aFromSegment,
297
                            uint32_t aCount,
298
                            uint32_t* aWriteCount);
299
300
  /**
301
   * Check whether every other byte in the sniffing buffer is zero.
302
   */
303
  void SniffBOMlessUTF16BasicLatin(const uint8_t* aFromSegment,
304
                                   uint32_t aCountToSniffingLimit);
305
306
  /**
307
   * <meta charset> scan failed. Try chardet if applicable. After this, the
308
   * parser will have some encoding, even if only a last-resort fallback.
309
   *
310
   * @param aFromSegment The current network buffer or null if the sniffing
311
   *                     buffer is being flushed due to network stream ending.
312
   * @param aCount       The number of bytes in aFromSegment (ignored if
313
   *                     aFromSegment is null)
314
   * @param aWriteCount  Return value for how many bytes got read from the
315
   *                     buffer.
316
   * @param aCountToSniffingLimit The number of unfilled slots in
317
   *                              mSniffingBuffer
318
   */
319
  nsresult FinalizeSniffing(const uint8_t* aFromSegment,
320
                            uint32_t aCount,
321
                            uint32_t* aWriteCount,
322
                            uint32_t aCountToSniffingLimit);
323
324
  /**
325
   * Set up the Unicode decoder and write the sniffing buffer into it
326
   * followed by the current network buffer.
327
   *
328
   * @param aFromSegment The current network buffer or null if the sniffing
329
   *                     buffer is being flushed due to network stream ending.
330
   * @param aCount       The number of bytes in aFromSegment (ignored if
331
   *                     aFromSegment is null)
332
   * @param aWriteCount  Return value for how many bytes got read from the
333
   *                     buffer.
334
   */
335
  nsresult SetupDecodingAndWriteSniffingBufferAndCurrentSegment(
336
    const uint8_t* aFromSegment,
337
    uint32_t aCount,
338
    uint32_t* aWriteCount);
339
340
  /**
341
   * Initialize the Unicode decoder, mark the BOM as the source and
342
   * drop the sniffer.
343
   *
344
   * @param aEncoding           The encoding for the decoder
345
   *                            (UTF-16BE, UTF-16LE or UTF-8; the BOM has
346
   *                            been swallowed)
347
   */
348
  nsresult SetupDecodingFromBom(NotNull<const Encoding*> aEncoding);
349
350
  /**
351
   * Become confident or resolve an encoding name to its preferred form.
352
   * @param aEncoding the value of an internal encoding decl. (Passed by
353
   *                  const reference, so it is not an out param.)
354
   * @return the encoding the parser needs to start using; presumably null
355
   *         (TODO confirm) if the parser became confident or if
356
   *         the encoding name did not specify a usable encoding
357
   */
358
  const Encoding* PreferredForInternalEncodingDecl(const nsACString& aEncoding);
359
360
  /**
361
   * Callback for mFlushTimer.
362
   */
363
  static void TimerCallback(nsITimer* aTimer, void* aClosure);
364
365
  /**
366
   * Parser thread entry point for (maybe) flushing the ops and posting
367
   * a flush runnable back on the main thread.
368
   */
369
  void TimerFlush();
370
371
  /**
372
   * Called when speculation fails.
373
   */
374
0
  void MaybeDisableFutureSpeculation() { mSpeculationFailureCount++; }
375
376
  /**
377
   * Used to check whether we're getting too many speculation failures and
378
   * should just stop trying.  The 100 is picked pretty randomly to be not too
379
   * small (so most pages are not affected) but small enough that we don't end
380
   * up with failed speculations over and over in pathological cases.
381
   */
382
0
  bool IsSpeculationEnabled() { return mSpeculationFailureCount < 100; }
383
384
  /**
385
   * Dispatch an event to a Quantum DOM main thread-ish thread.
386
   * (Not the parser thread.)
387
   */
388
  nsresult DispatchToMain(already_AddRefed<nsIRunnable>&& aRunnable);
389
390
  nsCOMPtr<nsIRequest> mRequest;
391
  nsCOMPtr<nsIRequestObserver> mObserver;
392
393
  /**
394
   * The document title to use if this turns out to be a View Source parser.
395
   */
396
  nsCString mViewSourceTitle;
397
398
  /**
399
   * The Unicode decoder
400
   */
401
  mozilla::UniquePtr<mozilla::Decoder> mUnicodeDecoder;
402
403
  /**
404
   * The buffer for sniffing the character encoding
405
   */
406
  mozilla::UniquePtr<uint8_t[]> mSniffingBuffer;
407
408
  /**
409
   * The number of meaningful bytes in mSniffingBuffer
410
   */
411
  uint32_t mSniffingLength;
412
413
  /**
414
   * BOM sniffing state
415
   */
416
  eBomState mBomState;
417
418
  /**
419
   * <meta> prescan implementation
420
   */
421
  nsAutoPtr<nsHtml5MetaScanner> mMetaScanner;
422
423
  // encoding-related stuff
424
  /**
425
   * The source (confidence) of the character encoding in use
426
   */
427
  int32_t mCharsetSource;
428
429
  /**
430
   * The character encoding in use
431
   */
432
  NotNull<const Encoding*> mEncoding;
433
434
  /**
435
   * Whether reparse is forbidden
436
   */
437
  bool mReparseForbidden;
438
439
  // Portable parser objects
440
  /**
441
   * The first buffer in the pending UTF-16 buffer queue
442
   */
443
  RefPtr<nsHtml5OwningUTF16Buffer> mFirstBuffer;
444
445
  /**
446
   * The last buffer in the pending UTF-16 buffer queue
447
   */
448
  nsHtml5OwningUTF16Buffer*
449
    mLastBuffer; // weak ref; always points to
450
                 // a buffer of the size NS_HTML5_STREAM_PARSER_READ_BUFFER_SIZE
451
452
  /**
453
   * The tree operation executor
454
   */
455
  nsHtml5TreeOpExecutor* mExecutor;
456
457
  /**
458
   * The same as mExecutor->mDocument->mDocGroup.
459
   */
460
  RefPtr<mozilla::dom::DocGroup> mDocGroup;
461
462
  /**
463
   * The HTML5 tree builder
464
   */
465
  nsAutoPtr<nsHtml5TreeBuilder> mTreeBuilder;
466
467
  /**
468
   * The HTML5 tokenizer
469
   */
470
  nsAutoPtr<nsHtml5Tokenizer> mTokenizer;
471
472
  /**
473
   * Makes sure the main thread can't mess the tokenizer state while it's
474
   * tokenizing. This mutex also protects the current speculation.
475
   */
476
  mozilla::Mutex mTokenizerMutex;
477
478
  /**
479
   * The scoped atom table
480
   */
481
  nsHtml5AtomTable mAtomTable;
482
483
  /**
484
   * The owner parser.
485
   */
486
  RefPtr<nsHtml5Parser> mOwner;
487
488
  /**
489
   * Whether the last character tokenized was a carriage return (for CRLF)
490
   */
491
  bool mLastWasCR;
492
493
  /**
494
   * For tracking stream life cycle
495
   */
496
  eHtml5StreamState mStreamState;
497
498
  /**
499
   * Whether we are speculating.
500
   */
501
  bool mSpeculating;
502
503
  /**
504
   * Whether the tokenizer has reached EOF. (Reset when the stream is rewound.)
505
   */
506
  bool mAtEOF;
507
508
  /**
509
   * The speculations. The mutex protects the nsTArray itself.
510
   * To access the queue of current speculation, mTokenizerMutex must be
511
   * obtained.
512
   * The current speculation is the last element
513
   */
514
  nsTArray<nsAutoPtr<nsHtml5Speculation>> mSpeculations;
515
  mozilla::Mutex mSpeculationMutex;
516
517
  /**
518
   * Number of times speculation has failed for this parser.
519
   */
520
  uint32_t mSpeculationFailureCount;
521
522
  /**
523
   * True to terminate early; protected by mTerminatedMutex
524
   */
525
  bool mTerminated;
526
  bool mInterrupted;
527
  mozilla::Mutex mTerminatedMutex;
528
529
  /**
530
   * The thread this stream parser runs on.
531
   */
532
  nsCOMPtr<nsISerialEventTarget> mEventTarget;
533
534
  nsCOMPtr<nsIRunnable> mExecutorFlusher;
535
536
  nsCOMPtr<nsIRunnable> mLoadFlusher;
537
538
  /**
539
   * The chardet instance if chardet is enabled.
540
   */
541
  nsCOMPtr<nsICharsetDetector> mChardet;
542
543
  /**
544
   * If false, don't push data to chardet.
545
   */
546
  bool mFeedChardet;
547
548
  /**
549
   * Whether the initial charset source was kCharsetFromParentFrame
550
   */
551
  bool mInitialEncodingWasFromParentFrame;
552
553
  bool mHasHadErrors;
554
555
  /**
556
   * Timer for flushing tree ops once in a while when not speculating.
557
   */
558
  nsCOMPtr<nsITimer> mFlushTimer;
559
560
  /**
561
   * Mutex for protecting access to mFlushTimer (but not for the two
562
   * mFlushTimerFoo booleans below).
563
   */
564
  mozilla::Mutex mFlushTimerMutex;
565
566
  /**
567
   * Keeps track whether mFlushTimer has been armed. Unfortunately,
568
   * nsITimer doesn't enable querying this from the timer itself.
569
   */
570
  bool mFlushTimerArmed;
571
572
  /**
573
   * False initially and true after the timer has fired at least once.
574
   */
575
  bool mFlushTimerEverFired;
576
577
  /**
578
   * Whether the parser is doing a normal parse, view source or plain text.
579
   */
580
  eParserMode mMode;
581
};
582
583
#endif // nsHtml5StreamParser_h