/src/mozilla-central/parser/htmlparser/nsScanner.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ |
2 | | /* vim: set ts=2 sw=2 et tw=78: */ |
3 | | /* This Source Code Form is subject to the terms of the Mozilla Public |
4 | | * License, v. 2.0. If a copy of the MPL was not distributed with this |
5 | | * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ |
6 | | |
7 | | //#define __INCREMENTAL 1 |
8 | | |
9 | | #include "nsScanner.h" |
10 | | |
11 | | #include "mozilla/Attributes.h" |
12 | | #include "mozilla/DebugOnly.h" |
13 | | #include "mozilla/Encoding.h" |
14 | | #include "nsDebug.h" |
15 | | #include "nsReadableUtils.h" |
16 | | #include "nsIInputStream.h" |
17 | | #include "nsIFile.h" |
18 | | #include "nsUTF8Utils.h" // for LossyConvertEncoding |
19 | | #include "nsCRT.h" |
20 | | #include "nsParser.h" |
21 | | #include "nsCharsetSource.h" |
22 | | |
23 | | nsReadEndCondition::nsReadEndCondition(const char16_t* aTerminateChars) : |
24 | | mChars(aTerminateChars), mFilter(char16_t(~0)) // All bits set |
25 | 0 | { |
26 | 0 | // Build a mask of the bits that are clear in every terminal char. A |
27 | 0 | // character that has any mFilter bit set can therefore not be one of the |
28 | 0 | // terminal chars. This works very well because terminal chars often have |
29 | 0 | // only the last 4-6 bits set and normal ascii letters have bit 7 (0x40) |
30 | 0 | // set. Other letters have even higher bits set. |
31 | 0 | |
32 | 0 | // Calculate filter: walk the null-terminated list, clearing each char's bits |
33 | 0 | const char16_t *current = aTerminateChars; |
34 | 0 | char16_t terminalChar = *current; |
35 | 0 | while (terminalChar) { |
36 | 0 | mFilter &= ~terminalChar; |
37 | 0 | ++current; |
38 | 0 | terminalChar = *current; |
39 | 0 | } |
40 | 0 | } |
41 | | |
42 | | /** |
43 | | * Use this constructor if you want i/o to be based on |
44 | | * a single string you hand in during construction. |
45 | | * This short cut was added for Javascript. |
46 | | * The resulting scanner is non-incremental and has no unicode decoder. |
47 | | * @update gess 5/12/98 |
48 | | * @param anHTMLString the text to scan; it is appended to the sliding buffer |
49 | | * @return (constructor, no return value) |
50 | | */ |
51 | | nsScanner::nsScanner(const nsAString& anHTMLString) |
52 | 0 | { |
53 | 0 | MOZ_COUNT_CTOR(nsScanner); |
54 | 0 |
|
55 | 0 | mSlidingBuffer = nullptr; |
56 | 0 | if (AppendToBuffer(anHTMLString)) { |
57 | 0 | mSlidingBuffer->BeginReading(mCurrentPosition); |
58 | 0 | } else { |
59 | 0 | /* XXX append failed: zero the iterators so current == end; see hack below, re: bug 182067 */ |
60 | 0 | memset(&mCurrentPosition, 0, sizeof(mCurrentPosition)); |
61 | 0 | mEndPosition = mCurrentPosition; |
62 | 0 | } |
63 | 0 | mMarkPosition = mCurrentPosition; |
64 | 0 | mIncremental = false; |
65 | 0 | mUnicodeDecoder = nullptr; |
66 | 0 | mCharsetSource = kCharsetUninitialized; |
67 | 0 | } |
68 | | |
69 | | /** |
70 | | * Use this constructor if you want i/o to be based on strings the scanner |
71 | | * receives via Append(); even with a null filename you can still append data. |
72 | | * aFilename initializes mFilename; aCreateStream is asserted to be false. |
73 | | */ |
74 | | nsScanner::nsScanner(nsString& aFilename, bool aCreateStream) |
75 | | : mFilename(aFilename) |
76 | 0 | { |
77 | 0 | MOZ_COUNT_CTOR(nsScanner); |
78 | 0 | NS_ASSERTION(!aCreateStream, "This is always true."); |
79 | 0 |
|
80 | 0 | mSlidingBuffer = nullptr; |
81 | 0 |
|
82 | 0 | // XXX This is a big hack. We need to initialize the iterators to something. |
83 | 0 | // What matters is that mCurrentPosition == mEndPosition, so that our methods |
84 | 0 | // believe that we are at EOF (see bug 182067). We null out mCurrentPosition |
85 | 0 | // so that we have some hope of catching null pointer dereferences associated |
86 | 0 | // with this hack. --darin |
87 | 0 | memset(&mCurrentPosition, 0, sizeof(mCurrentPosition)); |
88 | 0 | mMarkPosition = mCurrentPosition; |
89 | 0 | mEndPosition = mCurrentPosition; |
90 | 0 |
|
91 | 0 | mIncremental = true; |
92 | 0 |
|
93 | 0 | mUnicodeDecoder = nullptr; |
94 | 0 | mCharsetSource = kCharsetUninitialized; |
95 | 0 | // XML defaults to UTF-8 and about:blank is UTF-8, too, so start out with a UTF-8 decoder. |
96 | 0 | SetDocumentCharset(UTF_8_ENCODING, kCharsetFromDocTypeDefault); |
97 | 0 | } |
98 | | |
99 | | nsresult nsScanner::SetDocumentCharset(NotNull<const Encoding*> aEncoding, |
100 | | int32_t aSource) |
101 | 0 | { /* Adopts aEncoding unless aSource ranks below mCharsetSource; always returns NS_OK. */ |
102 | 0 | if (aSource < mCharsetSource) // lower priority than the current source: ignore the request |
103 | 0 | return NS_OK; |
104 | 0 | |
105 | 0 | mCharsetSource = aSource; |
106 | 0 | nsCString charsetName; |
107 | 0 | aEncoding->Name(charsetName); |
108 | 0 | if (!mCharset.IsEmpty() && charsetName.Equals(mCharset)) { |
109 | 0 | return NS_OK; // same charset name (source already updated): keep the existing decoder |
110 | 0 | } |
111 | 0 | |
112 | 0 | // different charset: remember its name and create a new decoder for it |
113 | 0 | |
114 | 0 | mCharset.Assign(charsetName); |
115 | 0 |
|
116 | 0 | mUnicodeDecoder = aEncoding->NewDecoderWithBOMRemoval(); |
117 | 0 |
|
118 | 0 | return NS_OK; |
119 | 0 | } |
120 | | |
121 | | |
122 | | /** |
123 | | * Destructor: deletes the sliding buffer, if one was ever allocated. |
124 | | * |
125 | | * @update gess 3/25/98 |
126 | | * MOZ_COUNT_DTOR here pairs with the MOZ_COUNT_CTOR calls made |
127 | | * in both constructors. |
128 | | */ |
129 | 0 | nsScanner::~nsScanner() { |
130 | 0 |
|
131 | 0 | delete mSlidingBuffer; |
132 | 0 |
|
133 | 0 | MOZ_COUNT_DTOR(nsScanner); |
134 | 0 | } |
135 | | |
136 | | /** |
137 | | * Resets current offset position of input stream to marked position. |
138 | | * This allows us to back up to this point if the need should arise, |
139 | | * such as when tokenization gets interrupted. |
140 | | * NOTE: IT IS REALLY BAD FORM TO CALL RELEASE WITHOUT CALLING MARK FIRST! |
141 | | * |
142 | | * @update gess 5/12/98 |
143 | | * Does nothing when no sliding buffer has been allocated yet. |
144 | | * @return nothing; only mCurrentPosition is changed (set to mMarkPosition) |
145 | | */ |
146 | 0 | void nsScanner::RewindToMark(void){ |
147 | 0 | if (mSlidingBuffer) { |
148 | 0 | mCurrentPosition = mMarkPosition; |
149 | 0 | } |
150 | 0 | } |
151 | | |
152 | | |
153 | | /** |
154 | | * Records current offset position in input stream. This allows us |
155 | | * to back up to this point if the need should arise, such as when |
156 | | * tokenization gets interrupted. |
157 | | * The buffer prefix before the current position is passed to DiscardPrefix(). |
158 | | * @update gess 7/29/98 |
159 | | * @return distance from the old start of the buffer to the current position, |
160 | | * or 0 when there is no sliding buffer |
161 | | */ |
162 | 0 | int32_t nsScanner::Mark() { |
163 | 0 | int32_t distance = 0; |
164 | 0 | if (mSlidingBuffer) { |
165 | 0 | nsScannerIterator oldStart; |
166 | 0 | mSlidingBuffer->BeginReading(oldStart); |
167 | 0 |
|
168 | 0 | distance = Distance(oldStart, mCurrentPosition); |
169 | 0 |
|
170 | 0 | mSlidingBuffer->DiscardPrefix(mCurrentPosition); |
171 | 0 | mSlidingBuffer->BeginReading(mCurrentPosition); |
172 | 0 | mMarkPosition = mCurrentPosition; |
173 | 0 | } |
174 | 0 |
|
175 | 0 | return distance; |
176 | 0 | } |
177 | | |
178 | | /** |
179 | | * Insert data to our underlying input buffer as |
180 | | * if it were read from an input stream. |
181 | | * aBuffer and the current position are handed to the sliding buffer's UngetReadable(). |
182 | | * @update harishd 01/12/99 |
183 | | * @return false if there is no sliding buffer, true otherwise |
184 | | */ |
185 | 0 | bool nsScanner::UngetReadable(const nsAString& aBuffer) { |
186 | 0 | if (!mSlidingBuffer) { |
187 | 0 | return false; |
188 | 0 | } |
189 | 0 | |
190 | 0 | mSlidingBuffer->UngetReadable(aBuffer,mCurrentPosition); |
191 | 0 | mSlidingBuffer->BeginReading(mCurrentPosition); // Insertion invalidated our iterators; refetch begin and end |
192 | 0 | mSlidingBuffer->EndReading(mEndPosition); |
193 | 0 | |
194 | 0 | return true; |
195 | 0 | } |
196 | | |
197 | | /** |
198 | | * Append data to our underlying input buffer as |
199 | | * if it were read from an input stream. |
200 | | * |
201 | | * @update gess4/3/98 |
202 | | * @return NS_OK, or NS_ERROR_OUT_OF_MEMORY if AppendToBuffer() reports failure |
203 | | */ |
204 | 0 | nsresult nsScanner::Append(const nsAString& aBuffer) { |
205 | 0 | if (!AppendToBuffer(aBuffer)) |
206 | 0 | return NS_ERROR_OUT_OF_MEMORY; |
207 | 0 | return NS_OK; |
208 | 0 | } |
209 | | |
210 | | /** |
211 | | * Decode aLen bytes from aBuffer to UTF-16 with mUnicodeDecoder and |
212 | | * append the decoded text to the sliding buffer. |
213 | | * @update gess 5/21/98 |
214 | | * @param aBuffer the bytes to decode; aLen is the number of bytes in aBuffer |
215 | | * @return NS_OK, NS_ERROR_OUT_OF_MEMORY, or NS_ERROR_FAILURE when no decoder is set |
216 | | */ |
217 | | nsresult nsScanner::Append(const char* aBuffer, uint32_t aLen) |
218 | 0 | { |
219 | 0 | nsresult res = NS_OK; |
220 | 0 | if (mUnicodeDecoder) { |
221 | 0 | CheckedInt<size_t> needed = mUnicodeDecoder->MaxUTF16BufferLength(aLen); |
222 | 0 | if (!needed.isValid()) { |
223 | 0 | return NS_ERROR_OUT_OF_MEMORY; |
224 | 0 | } |
225 | 0 | CheckedInt<uint32_t> allocLen(1); // one extra unit: null terminator due to legacy sadness |
226 | 0 | allocLen += needed.value(); |
227 | 0 | if (!allocLen.isValid()) { |
228 | 0 | return NS_ERROR_OUT_OF_MEMORY; |
229 | 0 | } |
230 | 0 | nsScannerString::Buffer* buffer = |
231 | 0 | nsScannerString::AllocBuffer(allocLen.value()); |
232 | 0 | NS_ENSURE_TRUE(buffer,NS_ERROR_OUT_OF_MEMORY); |
233 | 0 | char16_t *unichars = buffer->DataStart(); |
234 | 0 |
|
235 | 0 | uint32_t result; |
236 | 0 | size_t read; |
237 | 0 | size_t written; |
238 | 0 | Tie(result, read, written) = |
239 | 0 | mUnicodeDecoder->DecodeToUTF16WithoutReplacement( |
240 | 0 | AsBytes(MakeSpan(aBuffer, aLen)), |
241 | 0 | MakeSpan(unichars, needed.value()), |
242 | 0 | false); // Retain bug about failure to handle EOF |
243 | 0 | MOZ_ASSERT(result != kOutputFull); |
244 | 0 | MOZ_ASSERT(read <= aLen); |
245 | 0 | MOZ_ASSERT(written <= needed.value()); |
246 | 0 | if (result != kInputEmpty) { |
247 | 0 | // Since about:blank is empty, this line runs only for XML. Use a |
248 | 0 | // character that's illegal in XML instead of U+FFFD in order to make |
249 | 0 | // expat flag the error. There is no need to loop and convert more, since |
250 | 0 | // expat will stop here anyway. |
251 | 0 | unichars[written++] = 0xFFFF; |
252 | 0 | } |
253 | 0 | buffer->SetDataLength(written); |
254 | 0 | // Don't propagate return code of unicode decoder |
255 | 0 | // since it doesn't reflect on our success or failure |
256 | 0 | // - Ref. bug 87110 |
257 | 0 | res = NS_OK; |
258 | 0 | if (!AppendToBuffer(buffer)) |
259 | 0 | res = NS_ERROR_OUT_OF_MEMORY; |
260 | 0 | } |
261 | 0 | else { |
262 | 0 | NS_WARNING("No decoder found."); |
263 | 0 | res = NS_ERROR_FAILURE; |
264 | 0 | } |
265 | 0 |
|
266 | 0 | return res; |
267 | 0 | } |
268 | | |
269 | | /** |
270 | | * retrieve next char from the scanner's internal input buffer and advance past it |
271 | | * |
272 | | * @update gess 3/25/98 |
273 | | * @param aChar receives the next char, or 0 when nothing is left to read |
274 | | * @return NS_OK, or NS_ERROR_HTMLPARSER_EOF if there is no buffer or we are at its end |
275 | | */ |
276 | 0 | nsresult nsScanner::GetChar(char16_t& aChar) { |
277 | 0 | if (!mSlidingBuffer || mCurrentPosition == mEndPosition) { |
278 | 0 | aChar = 0; |
279 | 0 | return NS_ERROR_HTMLPARSER_EOF; |
280 | 0 | } |
281 | 0 | |
282 | 0 | aChar = *mCurrentPosition++; |
283 | 0 |
|
284 | 0 | return NS_OK; |
285 | 0 | } |
286 | | |
287 | | void nsScanner::BindSubstring(nsScannerSubstring& aSubstring, const nsScannerIterator& aStart, const nsScannerIterator& aEnd) |
288 | 0 | { /* Rebinds aSubstring to mSlidingBuffer using aStart and aEnd; mSlidingBuffer is dereferenced without a null check. */ |
289 | 0 | aSubstring.Rebind(*mSlidingBuffer, aStart, aEnd); |
290 | 0 | } |
291 | | |
292 | | void nsScanner::CurrentPosition(nsScannerIterator& aPosition) |
293 | 0 | { /* Out-parameter getter: copies mCurrentPosition into aPosition. */ |
294 | 0 | aPosition = mCurrentPosition; |
295 | 0 | } |
296 | | |
297 | | void nsScanner::EndReading(nsScannerIterator& aPosition) |
298 | 0 | { /* Out-parameter getter: copies mEndPosition into aPosition. */ |
299 | 0 | aPosition = mEndPosition; |
300 | 0 | } |
301 | | |
302 | | void nsScanner::SetPosition(nsScannerIterator& aPosition, bool aTerminate) |
303 | 0 | { /* Moves the current position to aPosition; does nothing when there is no sliding buffer. */ |
304 | 0 | if (mSlidingBuffer) { |
305 | 0 | mCurrentPosition = aPosition; |
306 | 0 | if (aTerminate && (mCurrentPosition == mEndPosition)) { /* at the end: move the mark here and discard the prefix */ |
307 | 0 | mMarkPosition = mCurrentPosition; |
308 | 0 | mSlidingBuffer->DiscardPrefix(mCurrentPosition); |
309 | 0 | } |
310 | 0 | } |
311 | 0 | } |
312 | | |
313 | | bool nsScanner::AppendToBuffer(nsScannerString::Buffer* aBuf) |
314 | 0 | { /* Adds aBuf to the sliding buffer, creating the buffer on first use; returns false only if that creation yields null. */ |
315 | 0 | if (!mSlidingBuffer) { |
316 | 0 | mSlidingBuffer = new nsScannerString(aBuf); |
317 | 0 | if (!mSlidingBuffer) |
318 | 0 | return false; |
319 | 0 | mSlidingBuffer->BeginReading(mCurrentPosition); |
320 | 0 | mMarkPosition = mCurrentPosition; |
321 | 0 | mSlidingBuffer->EndReading(mEndPosition); |
322 | 0 | } |
323 | 0 | else { |
324 | 0 | mSlidingBuffer->AppendBuffer(aBuf); |
325 | 0 | if (mCurrentPosition == mEndPosition) { /* was at the end: take the current position from BeginReading() again */ |
326 | 0 | mSlidingBuffer->BeginReading(mCurrentPosition); |
327 | 0 | } |
328 | 0 | mSlidingBuffer->EndReading(mEndPosition); |
329 | 0 | } |
330 | 0 |
|
331 | 0 | return true; |
332 | 0 | } |
333 | | |
334 | | /** |
335 | | * call this to copy the data out of the scanner that has not yet been consumed |
336 | | * by the tokenization process (current position up to the end position). |
337 | | * |
338 | | * @update gess 5/12/98 |
339 | | * @param aCopyBuffer is where the scanner buffer will be copied to; truncated if there is no buffer |
340 | | * @return result of CopyUnicodeTo (true if OK or false on OOM); true if there is no buffer |
341 | | */ |
342 | 0 | bool nsScanner::CopyUnusedData(nsString& aCopyBuffer) { |
343 | 0 | if (!mSlidingBuffer) { |
344 | 0 | aCopyBuffer.Truncate(); |
345 | 0 | return true; |
346 | 0 | } |
347 | 0 | |
348 | 0 | nsScannerIterator start, end; |
349 | 0 | start = mCurrentPosition; |
350 | 0 | end = mEndPosition; |
351 | 0 |
|
352 | 0 | return CopyUnicodeTo(start, end, aCopyBuffer); |
353 | 0 | } |
354 | | |
355 | | /** |
356 | | * Retrieve the name of the file that the scanner is reading from. |
357 | | * In some cases, it's just a given name, because the scanner isn't |
358 | | * really reading from a file. |
359 | | * |
360 | | * @update gess 5/12/98 |
361 | | * @return mutable reference to mFilename |
362 | | */ |
363 | 0 | nsString& nsScanner::GetFilename(void) { |
364 | 0 | return mFilename; |
365 | 0 | } |
366 | | |
367 | | /** |
368 | | * Conduct self test. Actually, selftesting for this class |
369 | | * occurs in the parser selftest. |
370 | | * |
371 | | * @update gess 3/25/98 |
372 | | * Currently a no-op: the body holds only an empty #ifdef _DEBUG block. |
373 | | * @return nothing |
374 | | */ |
375 | | |
376 | 0 | void nsScanner::SelfTest(void) { |
377 | | #ifdef _DEBUG |
378 | | #endif |
379 | | } |