/src/mozilla-central/parser/html/nsHtml5StreamParser.h
Line | Count | Source (jump to first uncovered line) |
1 | | /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ |
2 | | /* This Source Code Form is subject to the terms of the Mozilla Public |
3 | | * License, v. 2.0. If a copy of the MPL was not distributed with this |
4 | | * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ |
5 | | |
6 | | #ifndef nsHtml5StreamParser_h |
7 | | #define nsHtml5StreamParser_h |
8 | | |
9 | | #include "nsAutoPtr.h" |
10 | | #include "nsCOMPtr.h" |
11 | | #include "nsICharsetDetectionObserver.h" |
12 | | #include "nsHtml5MetaScanner.h" |
13 | | #include "mozilla/Encoding.h" |
14 | | #include "nsHtml5TreeOpExecutor.h" |
15 | | #include "nsHtml5OwningUTF16Buffer.h" |
16 | | #include "nsIInputStream.h" |
17 | | #include "mozilla/Mutex.h" |
18 | | #include "mozilla/UniquePtr.h" |
19 | | #include "nsHtml5AtomTable.h" |
20 | | #include "nsHtml5Speculation.h" |
21 | | #include "nsISerialEventTarget.h" |
22 | | #include "nsITimer.h" |
23 | | #include "nsICharsetDetector.h" |
24 | | #include "mozilla/dom/DocGroup.h" |
25 | | |
26 | | class nsHtml5Parser; |
27 | | /* Buffer sizes for the stream parser. mLastBuffer is documented as always pointing to a buffer of the read size; the sniffing size is presumably the capacity of mSniffingBuffer -- confirm in the .cpp. */ |
28 | 0 | #define NS_HTML5_STREAM_PARSER_READ_BUFFER_SIZE 1024 |
29 | 0 | #define NS_HTML5_STREAM_PARSER_SNIFFING_BUFFER_SIZE 1024 |
30 | | /** The modes a parser can run in; nsHtml5StreamParser keeps one in mMode. */ |
31 | | enum eParserMode |
32 | | { |
33 | | /** |
34 | | * Parse a document normally as HTML. |
35 | | */ |
36 | | NORMAL, |
37 | | |
38 | | /** |
39 | | * View document as HTML source. |
40 | | */ |
41 | | VIEW_SOURCE_HTML, |
42 | | |
43 | | /** |
44 | | * View document as XML source. |
45 | | */ |
46 | | VIEW_SOURCE_XML, |
47 | | |
48 | | /** |
49 | | * View document as plain text source. |
50 | | */ |
51 | | VIEW_SOURCE_PLAIN, |
52 | | |
53 | | /** |
54 | | * View document as plain text. |
55 | | */ |
56 | | PLAIN_TEXT, |
57 | | |
58 | | /** |
59 | | * Load as data (XHR). |
60 | | */ |
61 | | LOAD_AS_DATA |
62 | | }; |
63 | | /** States of byte order mark (BOM) sniffing; nsHtml5StreamParser keeps one in mBomState. */ |
64 | | enum eBomState |
65 | | { |
66 | | /** |
67 | | * BOM sniffing hasn't started. |
68 | | */ |
69 | | BOM_SNIFFING_NOT_STARTED = 0, |
70 | | |
71 | | /** |
72 | | * BOM sniffing is ongoing, and the first byte of a UTF-16LE BOM has been |
73 | | * seen. |
74 | | */ |
75 | | SEEN_UTF_16_LE_FIRST_BYTE = 1, |
76 | | |
77 | | /** |
78 | | * BOM sniffing is ongoing, and the first byte of a UTF-16BE BOM has been |
79 | | * seen. |
80 | | */ |
81 | | SEEN_UTF_16_BE_FIRST_BYTE = 2, |
82 | | |
83 | | /** |
84 | | * BOM sniffing is ongoing, and the first byte of a UTF-8 BOM has been |
85 | | * seen. |
86 | | */ |
87 | | SEEN_UTF_8_FIRST_BYTE = 3, |
88 | | |
89 | | /** |
90 | | * BOM sniffing is ongoing, and the first and second bytes of a UTF-8 BOM |
91 | | * have been seen. |
92 | | */ |
93 | | SEEN_UTF_8_SECOND_BYTE = 4, |
94 | | |
95 | | /** |
96 | | * BOM sniffing was started but is now over for whatever reason. |
97 | | */ |
98 | | BOM_SNIFFING_OVER = 5 |
99 | | }; |
100 | | /** Stream life cycle states; nsHtml5StreamParser tracks the current one in mStreamState. */ |
101 | | enum eHtml5StreamState |
102 | | { |
103 | | STREAM_NOT_STARTED = 0, |
104 | | STREAM_BEING_READ = 1, |
105 | | STREAM_ENDED = 2 |
106 | | }; |
107 | | /** Stream parser: takes the request callbacks that nsHtml5StreamListener calls, declares the encoding sniffing/decoding steps, and holds the tokenizer and tree builder. */ |
108 | | class nsHtml5StreamParser final : public nsICharsetDetectionObserver |
109 | | { |
110 | | template<typename T> |
111 | | using NotNull = mozilla::NotNull<T>; |
112 | | using Encoding = mozilla::Encoding; |
113 | | |
114 | | friend class nsHtml5RequestStopper; |
115 | | friend class nsHtml5DataAvailable; |
116 | | friend class nsHtml5StreamParserContinuation; |
117 | | friend class nsHtml5TimerKungFu; |
118 | | friend class nsHtml5StreamParserPtr; |
119 | | |
120 | | public: |
121 | | NS_DECL_CYCLE_COLLECTING_ISUPPORTS |
122 | | NS_DECL_CYCLE_COLLECTION_CLASS_AMBIGUOUS(nsHtml5StreamParser, |
123 | | nsICharsetDetectionObserver) |
124 | | |
125 | | nsHtml5StreamParser(nsHtml5TreeOpExecutor* aExecutor, |
126 | | nsHtml5Parser* aOwner, |
127 | | eParserMode aMode); |
128 | | |
129 | | // Methods that nsHtml5StreamListener calls |
130 | | nsresult CheckListenerChain(); |
131 | | |
132 | | nsresult OnStartRequest(nsIRequest* aRequest, nsISupports* aContext); |
133 | | |
134 | | nsresult OnDataAvailable(nsIRequest* aRequest, |
135 | | nsISupports* aContext, |
136 | | nsIInputStream* aInStream, |
137 | | uint64_t aSourceOffset, |
138 | | uint32_t aLength); |
139 | | |
140 | | nsresult OnStopRequest(nsIRequest* aRequest, |
141 | | nsISupports* aContext, |
142 | | nsresult status); |
143 | | |
144 | | // nsICharsetDetectionObserver |
145 | | /** |
146 | | * Chardet calls this to report the detection result |
147 | | */ |
148 | | NS_IMETHOD Notify(const char* aCharset, nsDetectionConfident aConf) override; |
149 | | |
150 | | // EncodingDeclarationHandler |
151 | | // https://hg.mozilla.org/projects/htmlparser/file/tip/src/nu/validator/htmlparser/common/EncodingDeclarationHandler.java |
152 | | /** |
153 | | * Tree builder uses this to report a late <meta charset> |
154 | | */ |
155 | | bool internalEncodingDeclaration(nsHtml5String aEncoding); |
156 | | |
157 | | // Not from an external interface |
158 | | |
159 | | /** |
160 | | * Call this method once you've created a parser, and want to instruct it |
161 | | * about what charset to load. Asserts that the stream has not started yet. |
162 | | * |
163 | | * @param aEncoding the charset of a document |
164 | | * @param aSource the source of the charset |
165 | | */ |
166 | | inline void SetDocumentCharset(NotNull<const Encoding*> aEncoding, |
167 | | int32_t aSource) |
168 | 0 | { |
169 | 0 | MOZ_ASSERT(mStreamState == STREAM_NOT_STARTED, |
170 | 0 | "SetDocumentCharset called too late."); |
171 | 0 | NS_ASSERTION(NS_IsMainThread(), "Wrong thread!"); |
172 | 0 | mEncoding = aEncoding; |
173 | 0 | mCharsetSource = aSource; |
174 | 0 | } |
175 | | |
176 | | inline void SetObserver(nsIRequestObserver* aObserver) |
177 | 0 | { |
178 | 0 | NS_ASSERTION(NS_IsMainThread(), "Wrong thread!"); |
179 | 0 | mObserver = aObserver; |
180 | 0 | } |
181 | | |
182 | | nsresult GetChannel(nsIChannel** aChannel); |
183 | | |
184 | | /** |
185 | | * The owner parser must call this after script execution |
186 | | * when no scripts are executing and the document.written |
187 | | * buffer has been exhausted. |
188 | | */ |
189 | | void ContinueAfterScripts(nsHtml5Tokenizer* aTokenizer, |
190 | | nsHtml5TreeBuilder* aTreeBuilder, |
191 | | bool aLastWasCR); |
192 | | |
193 | | /** |
194 | | * Continues the stream parser if the charset switch failed. |
195 | | */ |
196 | | void ContinueAfterFailedCharsetSwitch(); |
197 | | |
198 | | void Terminate() |
199 | 0 | { |
200 | 0 | mozilla::MutexAutoLock autoLock(mTerminatedMutex); |
201 | 0 | mTerminated = true; |
202 | 0 | } |
203 | | |
204 | | void DropTimer(); |
205 | | |
206 | | /** |
207 | | * Sets mEncoding and mCharsetSource appropriately for the XML View Source |
208 | | * case if aEncoding names a supported rough ASCII superset and sets |
209 | | * the mEncoding and mCharsetSource to the UTF-8 default otherwise. |
210 | | */ |
211 | | void SetEncodingFromExpat(const char16_t* aEncoding); |
212 | | |
213 | | /** |
214 | | * Sets the URL for View Source title in case this parser ends up being |
215 | | * used for View Source. If aURL is a view-source: URL, takes the inner |
216 | | * URL. data: URLs are shown with an ellipsis instead of the actual data. |
217 | | */ |
218 | | void SetViewSourceTitle(nsIURI* aURL); |
219 | | |
220 | | private: |
221 | | virtual ~nsHtml5StreamParser(); |
222 | | |
223 | | #ifdef DEBUG |
224 | | bool IsParserThread() { return mEventTarget->IsOnCurrentThread(); } |
225 | | #endif |
226 | | |
227 | | void MarkAsBroken(nsresult aRv); |
228 | | |
229 | | /** |
230 | | * Marks the stream parser as interrupted. If you ever add calls to this |
231 | | * method, be sure to review Uninterrupt usage very, very carefully to |
232 | | * avoid having a previous in-flight runnable cancel your Interrupt() |
233 | | * call on the other thread too soon. |
234 | | */ |
235 | | void Interrupt() |
236 | 0 | { |
237 | 0 | mozilla::MutexAutoLock autoLock(mTerminatedMutex); |
238 | 0 | mInterrupted = true; |
239 | 0 | } |
240 | | |
241 | | void Uninterrupt() |
242 | 0 | { |
243 | 0 | NS_ASSERTION(IsParserThread(), "Wrong thread!"); |
244 | 0 | mTokenizerMutex.AssertCurrentThreadOwns(); |
245 | 0 | // Not acquiring mTerminatedMutex: mTokenizerMutex is already held here and is treated as stronger. |
246 | 0 | // NOTE(review): Interrupt()/IsTerminatedOrInterrupted() use only mTerminatedMutex -- confirm no race. |
247 | 0 | mInterrupted = false; |
248 | 0 | } |
249 | | |
250 | | /** |
251 | | * Flushes the tree ops from the tree builder and disarms the flush |
252 | | * timer. |
253 | | */ |
254 | | void FlushTreeOpsAndDisarmTimer(); |
255 | | |
256 | | void ParseAvailableData(); |
257 | | |
258 | | void DoStopRequest(); |
259 | | |
260 | | void DoDataAvailable(const uint8_t* aBuffer, uint32_t aLength); |
261 | | |
262 | | static nsresult CopySegmentsToParser(nsIInputStream* aInStream, |
263 | | void* aClosure, |
264 | | const char* aFromSegment, |
265 | | uint32_t aToOffset, |
266 | | uint32_t aCount, |
267 | | uint32_t* aWriteCount); |
268 | | |
269 | | bool IsTerminatedOrInterrupted() |
270 | 0 | { |
271 | 0 | mozilla::MutexAutoLock autoLock(mTerminatedMutex); |
272 | 0 | return mTerminated || mInterrupted; |
273 | 0 | } |
274 | | |
275 | | bool IsTerminated() |
276 | 0 | { |
277 | 0 | mozilla::MutexAutoLock autoLock(mTerminatedMutex); |
278 | 0 | return mTerminated; |
279 | 0 | } |
280 | | |
281 | | /** |
282 | | * True when there is a Unicode decoder already |
283 | | */ |
284 | 0 | inline bool HasDecoder() { return !!mUnicodeDecoder; } |
285 | | |
286 | | /** |
287 | | * Push bytes from network when there is no Unicode decoder yet |
288 | | */ |
289 | | nsresult SniffStreamBytes(const uint8_t* aFromSegment, |
290 | | uint32_t aCount, |
291 | | uint32_t* aWriteCount); |
292 | | |
293 | | /** |
294 | | * Push bytes from network when there is a Unicode decoder already |
295 | | */ |
296 | | nsresult WriteStreamBytes(const uint8_t* aFromSegment, |
297 | | uint32_t aCount, |
298 | | uint32_t* aWriteCount); |
299 | | |
300 | | /** |
301 | | * Check whether every other byte in the sniffing buffer is zero. |
302 | | */ |
303 | | void SniffBOMlessUTF16BasicLatin(const uint8_t* aFromSegment, |
304 | | uint32_t aCountToSniffingLimit); |
305 | | |
306 | | /** |
307 | | * <meta charset> scan failed. Try chardet if applicable. After this, the |
308 | | * parser will have some encoding, even if only a last-resort fallback. |
309 | | * |
310 | | * @param aFromSegment The current network buffer or null if the sniffing |
311 | | * buffer is being flushed due to network stream ending. |
312 | | * @param aCount The number of bytes in aFromSegment (ignored if |
313 | | * aFromSegment is null) |
314 | | * @param aWriteCount Return value for how many bytes got read from the |
315 | | * buffer. |
316 | | * @param aCountToSniffingLimit The number of unfilled slots in |
317 | | * mSniffingBuffer |
318 | | */ |
319 | | nsresult FinalizeSniffing(const uint8_t* aFromSegment, |
320 | | uint32_t aCount, |
321 | | uint32_t* aWriteCount, |
322 | | uint32_t aCountToSniffingLimit); |
323 | | |
324 | | /** |
325 | | * Set up the Unicode decoder and write the sniffing buffer into it |
326 | | * followed by the current network buffer. |
327 | | * |
328 | | * @param aFromSegment The current network buffer or null if the sniffing |
329 | | * buffer is being flushed due to network stream ending. |
330 | | * @param aCount The number of bytes in aFromSegment (ignored if |
331 | | * aFromSegment is null) |
332 | | * @param aWriteCount Return value for how many bytes got read from the |
333 | | * buffer. |
334 | | */ |
335 | | nsresult SetupDecodingAndWriteSniffingBufferAndCurrentSegment( |
336 | | const uint8_t* aFromSegment, |
337 | | uint32_t aCount, |
338 | | uint32_t* aWriteCount); |
339 | | |
340 | | /** |
341 | | * Initialize the Unicode decoder, mark the BOM as the source and |
342 | | * drop the sniffer. |
343 | | * |
344 | | * @param aEncoding The encoding for the decoder |
345 | | * (UTF-16BE, UTF-16LE or UTF-8; the BOM has |
346 | | * been swallowed) |
347 | | */ |
348 | | nsresult SetupDecodingFromBom(NotNull<const Encoding*> aEncoding); |
349 | | |
350 | | /** |
351 | | * Become confident or resolve an encoding name to its preferred form. |
352 | | * @param aEncoding the value of an internal encoding decl. (It is a const |
353 | | * reference in this declaration, so it cannot act as an out param.) |
354 | | * @return NOTE(review): declared as const Encoding*, not bool -- presumably |
355 | | * non-null if the parser needs to start using the returned encoding and |
356 | | * null if it became confident or the name was unusable; confirm in the .cpp. |
357 | | */ |
358 | | const Encoding* PreferredForInternalEncodingDecl(const nsACString& aEncoding); |
359 | | |
360 | | /** |
361 | | * Callback for mFlushTimer. |
362 | | */ |
363 | | static void TimerCallback(nsITimer* aTimer, void* aClosure); |
364 | | |
365 | | /** |
366 | | * Parser thread entry point for (maybe) flushing the ops and posting |
367 | | * a flush runnable back on the main thread. |
368 | | */ |
369 | | void TimerFlush(); |
370 | | |
371 | | /** |
372 | | * Called when speculation fails. Increments the failure count. |
373 | | */ |
374 | 0 | void MaybeDisableFutureSpeculation() { mSpeculationFailureCount++; } |
375 | | |
376 | | /** |
377 | | * Used to check whether we're getting too many speculation failures and |
378 | | * should just stop trying. The 100 is picked pretty randomly to be not too |
379 | | * small (so most pages are not affected) but small enough that we don't end |
380 | | * up with failed speculations over and over in pathological cases. |
381 | | */ |
382 | 0 | bool IsSpeculationEnabled() { return mSpeculationFailureCount < 100; } |
383 | | |
384 | | /** |
385 | | * Dispatch an event to a Quantum DOM main thread-ish thread. |
386 | | * (Not the parser thread.) |
387 | | */ |
388 | | nsresult DispatchToMain(already_AddRefed<nsIRunnable>&& aRunnable); |
389 | | |
390 | | nsCOMPtr<nsIRequest> mRequest; |
391 | | nsCOMPtr<nsIRequestObserver> mObserver; |
392 | | |
393 | | /** |
394 | | * The document title to use if this turns out to be a View Source parser. |
395 | | */ |
396 | | nsCString mViewSourceTitle; |
397 | | |
398 | | /** |
399 | | * The Unicode decoder |
400 | | */ |
401 | | mozilla::UniquePtr<mozilla::Decoder> mUnicodeDecoder; |
402 | | |
403 | | /** |
404 | | * The buffer for sniffing the character encoding |
405 | | */ |
406 | | mozilla::UniquePtr<uint8_t[]> mSniffingBuffer; |
407 | | |
408 | | /** |
409 | | * The number of meaningful bytes in mSniffingBuffer |
410 | | */ |
411 | | uint32_t mSniffingLength; |
412 | | |
413 | | /** |
414 | | * BOM sniffing state |
415 | | */ |
416 | | eBomState mBomState; |
417 | | |
418 | | /** |
419 | | * <meta> prescan implementation |
420 | | */ |
421 | | nsAutoPtr<nsHtml5MetaScanner> mMetaScanner; |
422 | | |
423 | | // encoding-related stuff |
424 | | /** |
425 | | * The source (confidence) of the character encoding in use |
426 | | */ |
427 | | int32_t mCharsetSource; |
428 | | |
429 | | /** |
430 | | * The character encoding in use |
431 | | */ |
432 | | NotNull<const Encoding*> mEncoding; |
433 | | |
434 | | /** |
435 | | * Whether reparse is forbidden |
436 | | */ |
437 | | bool mReparseForbidden; |
438 | | |
439 | | // Portable parser objects |
440 | | /** |
441 | | * The first buffer in the pending UTF-16 buffer queue |
442 | | */ |
443 | | RefPtr<nsHtml5OwningUTF16Buffer> mFirstBuffer; |
444 | | |
445 | | /** |
446 | | * The last buffer in the pending UTF-16 buffer queue |
447 | | */ |
448 | | nsHtml5OwningUTF16Buffer* |
449 | | mLastBuffer; // weak ref; always points to |
450 | | // a buffer of the size NS_HTML5_STREAM_PARSER_READ_BUFFER_SIZE |
451 | | |
452 | | /** |
453 | | * The tree operation executor |
454 | | */ |
455 | | nsHtml5TreeOpExecutor* mExecutor; |
456 | | |
457 | | /** |
458 | | * The same as mExecutor->mDocument->mDocGroup. |
459 | | */ |
460 | | RefPtr<mozilla::dom::DocGroup> mDocGroup; |
461 | | |
462 | | /** |
463 | | * The HTML5 tree builder |
464 | | */ |
465 | | nsAutoPtr<nsHtml5TreeBuilder> mTreeBuilder; |
466 | | |
467 | | /** |
468 | | * The HTML5 tokenizer |
469 | | */ |
470 | | nsAutoPtr<nsHtml5Tokenizer> mTokenizer; |
471 | | |
472 | | /** |
473 | | * Makes sure the main thread can't mess the tokenizer state while it's |
474 | | * tokenizing. This mutex also protects the current speculation. |
475 | | */ |
476 | | mozilla::Mutex mTokenizerMutex; |
477 | | |
478 | | /** |
479 | | * The scoped atom table |
480 | | */ |
481 | | nsHtml5AtomTable mAtomTable; |
482 | | |
483 | | /** |
484 | | * The owner parser. |
485 | | */ |
486 | | RefPtr<nsHtml5Parser> mOwner; |
487 | | |
488 | | /** |
489 | | * Whether the last character tokenized was a carriage return (for CRLF) |
490 | | */ |
491 | | bool mLastWasCR; |
492 | | |
493 | | /** |
494 | | * For tracking stream life cycle |
495 | | */ |
496 | | eHtml5StreamState mStreamState; |
497 | | |
498 | | /** |
499 | | * Whether we are speculating. |
500 | | */ |
501 | | bool mSpeculating; |
502 | | |
503 | | /** |
504 | | * Whether the tokenizer has reached EOF. (Reset when stream rewound.) |
505 | | */ |
506 | | bool mAtEOF; |
507 | | |
508 | | /** |
509 | | * The speculations. The mutex protects the nsTArray itself. |
510 | | * To access the queue of current speculation, mTokenizerMutex must be |
511 | | * obtained. |
512 | | * The current speculation is the last element. |
513 | | */ |
514 | | nsTArray<nsAutoPtr<nsHtml5Speculation>> mSpeculations; |
515 | | mozilla::Mutex mSpeculationMutex; |
516 | | |
517 | | /** |
518 | | * Number of times speculation has failed for this parser. |
519 | | */ |
520 | | uint32_t mSpeculationFailureCount; |
521 | | |
522 | | /** |
523 | | * mTerminated: true to terminate early; protected by mTerminatedMutex. mInterrupted: set by Interrupt(), cleared by Uninterrupt(). |
524 | | */ |
525 | | bool mTerminated; |
526 | | bool mInterrupted; |
527 | | mozilla::Mutex mTerminatedMutex; |
528 | | |
529 | | /** |
530 | | * The thread this stream parser runs on. |
531 | | */ |
532 | | nsCOMPtr<nsISerialEventTarget> mEventTarget; |
533 | | |
534 | | nsCOMPtr<nsIRunnable> mExecutorFlusher; |
535 | | |
536 | | nsCOMPtr<nsIRunnable> mLoadFlusher; |
537 | | |
538 | | /** |
539 | | * The chardet instance if chardet is enabled. |
540 | | */ |
541 | | nsCOMPtr<nsICharsetDetector> mChardet; |
542 | | |
543 | | /** |
544 | | * If false, don't push data to chardet. |
545 | | */ |
546 | | bool mFeedChardet; |
547 | | |
548 | | /** |
549 | | * Whether the initial charset source was kCharsetFromParentFrame |
550 | | */ |
551 | | bool mInitialEncodingWasFromParentFrame; |
552 | | |
553 | | bool mHasHadErrors; |
554 | | |
555 | | /** |
556 | | * Timer for flushing tree ops once in a while when not speculating. |
557 | | */ |
558 | | nsCOMPtr<nsITimer> mFlushTimer; |
559 | | |
560 | | /** |
561 | | * Mutex for protecting access to mFlushTimer (but not for the two |
562 | | * mFlushTimerFoo booleans below). |
563 | | */ |
564 | | mozilla::Mutex mFlushTimerMutex; |
565 | | |
566 | | /** |
567 | | * Keeps track whether mFlushTimer has been armed. Unfortunately, |
568 | | * nsITimer doesn't enable querying this from the timer itself. |
569 | | */ |
570 | | bool mFlushTimerArmed; |
571 | | |
572 | | /** |
573 | | * False initially and true after the timer has fired at least once. |
574 | | */ |
575 | | bool mFlushTimerEverFired; |
576 | | |
577 | | /** |
578 | | * Whether the parser is doing a normal parse, view source or plain text. |
579 | | */ |
580 | | eParserMode mMode; |
581 | | }; |
582 | | |
583 | | #endif // nsHtml5StreamParser_h |