Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/yaml/scanner.py: 11%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

753 statements  

1 

2# Scanner produces tokens of the following types: 

3# STREAM-START 

4# STREAM-END 

5# DIRECTIVE(name, value) 

6# DOCUMENT-START 

7# DOCUMENT-END 

8# BLOCK-SEQUENCE-START 

9# BLOCK-MAPPING-START 

10# BLOCK-END 

11# FLOW-SEQUENCE-START 

12# FLOW-MAPPING-START 

13# FLOW-SEQUENCE-END 

14# FLOW-MAPPING-END 

15# BLOCK-ENTRY 

16# FLOW-ENTRY 

17# KEY 

18# VALUE 

19# ALIAS(value) 

20# ANCHOR(value) 

21# TAG(value) 

22# SCALAR(value, plain, style) 

23# 

24# Read comments in the Scanner code for more details. 

25# 

26 

27__all__ = ['Scanner', 'ScannerError'] 

28 

29from .error import MarkedYAMLError 

30from .tokens import * 

31 

class ScannerError(MarkedYAMLError):
    """Error raised by the scanner when the input cannot be tokenized."""

34 

class SimpleKey:
    """Record of a position where a simple key may start.

    The scanner keeps at most one such record per flow level (see the
    simple keys treatment in ``Scanner``).
    """

    def __init__(self, token_number, required, index, line, column, mark):
        # Number of the token that would become the key.
        self.token_number = token_number
        # True if a ':' must follow; the scanner raises an error otherwise.
        self.required = required
        # Reader position (index, line, column) where the candidate starts.
        self.index = index
        self.line = line
        self.column = column
        # Mark used for the KEY token and for error reporting.
        self.mark = mark

    def __repr__(self):
        # The mark is left out on purpose to keep the output short.
        return "%s(token_number=%r, required=%r, index=%r, line=%r, column=%r)" % (
                self.__class__.__name__, self.token_number, self.required,
                self.index, self.line, self.column)

45 

46class Scanner: 

47 

def __init__(self):
    """Initialize the scanner state and queue the STREAM-START token."""
    # It is assumed that Scanner and Reader will have a common descendant.
    # Reader does the dirty work of checking for BOM and converting the
    # input data to Unicode. It also adds NUL to the end.
    #
    # Reader supports the following methods
    #   self.peek(i=0)       # peek the next i-th character
    #   self.prefix(l=1)     # peek the next l characters
    #   self.forward(l=1)    # read the next l characters and move the pointer.

    # Have we reached the end of the stream?
    self.done = False

    # The number of unclosed '{' and '['. `flow_level == 0` means block
    # context.
    self.flow_level = 0

    # List of processed tokens that are not yet emitted.
    self.tokens = []

    # Add the STREAM-START token. This must come after `self.tokens` is
    # created, because the fetcher appends to it.
    self.fetch_stream_start()

    # Number of tokens that were emitted through the `get_token` method.
    self.tokens_taken = 0

    # The current indentation level.
    self.indent = -1

    # Past indentation levels.
    self.indents = []

    # Variables related to simple keys treatment.

    # A simple key is a key that is not denoted by the '?' indicator.
    # Example of simple keys:
    #   ---
    #   block simple key: value
    #   ? not a simple key:
    #   : { flow simple key: value }
    # We emit the KEY token before all keys, so when we find a potential
    # simple key, we try to locate the corresponding ':' indicator.
    # Simple keys should be limited to a single line and 1024 characters.

    # Can a simple key start at the current position? A simple key may
    # start:
    # - at the beginning of the line, not counting indentation spaces
    #       (in block context),
    # - after '{', '[', ',' (in the flow context),
    # - after '?', ':', '-' (in the block context).
    # In the block context, this flag also signifies if a block collection
    # may start at the current position.
    self.allow_simple_key = True

    # Keep track of possible simple keys. This is a dictionary. The key
    # is `flow_level`; there can be no more than one possible simple key
    # for each level. The value is a SimpleKey record:
    #   (token_number, required, index, line, column, mark)
    # A simple key may start with ALIAS, ANCHOR, TAG, SCALAR(flow),
    # '[', or '{' tokens.
    self.possible_simple_keys = {}

110 

111 # Public methods. 

112 

def check_token(self, *choices):
    """Tell whether the next token is an instance of one of `choices`.

    With no `choices`, tell whether any token is available at all.
    """
    # Scan ahead until the head of the queue is settled.
    while self.need_more_tokens():
        self.fetch_more_tokens()
    if not self.tokens:
        return False
    if not choices:
        return True
    # `choices` is a tuple of classes, which isinstance() accepts directly.
    return isinstance(self.tokens[0], choices)

124 

def peek_token(self):
    """Return the next token without removing it from the queue.

    Returns None when there are no more tokens.
    """
    while self.need_more_tokens():
        self.fetch_more_tokens()
    return self.tokens[0] if self.tokens else None

134 

def get_token(self):
    """Remove and return the next token, or return None if there is none."""
    while self.need_more_tokens():
        self.fetch_more_tokens()
    if not self.tokens:
        return None
    # Count the token as emitted before handing it out.
    self.tokens_taken += 1
    return self.tokens.pop(0)

142 

143 # Private methods. 

144 

def need_more_tokens(self):
    """Tell whether more input must be scanned before a token is emitted.

    Returns:
        False once the stream is finished; True when the queue is empty or
        when the next token to emit is still a possible simple key (a KEY
        token may have to be inserted before it); False otherwise.
    """
    if self.done:
        return False
    if not self.tokens:
        return True
    # The current token may be a potential simple key, so we
    # need to look further.
    self.stale_possible_simple_keys()
    # Always return a bool rather than falling off the end with None.
    return self.next_possible_simple_key() == self.tokens_taken

155 

def fetch_more_tokens(self):
    """Scan the input up to the next token and queue it.

    Skips whitespace and comments, drops stale simple-key candidates,
    unwinds the indentation for the current column, then dispatches on the
    next character to the matching `fetch_*` method.

    Raises:
        ScannerError: if the next character cannot start any token.
    """

    # Eat whitespaces and comments until we reach the next token.
    self.scan_to_next_token()

    # Remove obsolete possible simple keys.
    self.stale_possible_simple_keys()

    # Compare the current indentation and column. It may add some tokens
    # and decrease the current indentation level.
    self.unwind_indent(self.column)

    # Peek the next character.
    ch = self.peek()

    # Is it the end of stream?
    if ch == '\0':
        return self.fetch_stream_end()

    # Is it a directive?
    if ch == '%' and self.check_directive():
        return self.fetch_directive()

    # Is it the document start?
    if ch == '-' and self.check_document_start():
        return self.fetch_document_start()

    # Is it the document end?
    if ch == '.' and self.check_document_end():
        return self.fetch_document_end()

    # TODO: support for BOM within a stream.
    #if ch == '\uFEFF':
    #    return self.fetch_bom()    <-- issue BOMToken

    # Note: the order of the following checks is NOT significant.

    # Is it the flow sequence start indicator?
    if ch == '[':
        return self.fetch_flow_sequence_start()

    # Is it the flow mapping start indicator?
    if ch == '{':
        return self.fetch_flow_mapping_start()

    # Is it the flow sequence end indicator?
    if ch == ']':
        return self.fetch_flow_sequence_end()

    # Is it the flow mapping end indicator?
    if ch == '}':
        return self.fetch_flow_mapping_end()

    # Is it the flow entry indicator?
    if ch == ',':
        return self.fetch_flow_entry()

    # Is it the block entry indicator?
    if ch == '-' and self.check_block_entry():
        return self.fetch_block_entry()

    # Is it the key indicator?
    if ch == '?' and self.check_key():
        return self.fetch_key()

    # Is it the value indicator?
    if ch == ':' and self.check_value():
        return self.fetch_value()

    # Is it an alias?
    if ch == '*':
        return self.fetch_alias()

    # Is it an anchor?
    if ch == '&':
        return self.fetch_anchor()

    # Is it a tag?
    if ch == '!':
        return self.fetch_tag()

    # Is it a literal scalar? (Only recognized in block context.)
    if ch == '|' and not self.flow_level:
        return self.fetch_literal()

    # Is it a folded scalar? (Only recognized in block context.)
    if ch == '>' and not self.flow_level:
        return self.fetch_folded()

    # Is it a single quoted scalar?
    if ch == '\'':
        return self.fetch_single()

    # Is it a double quoted scalar?
    if ch == '\"':
        return self.fetch_double()

    # It must be a plain scalar then.
    if self.check_plain():
        return self.fetch_plain()

    # No? It's an error. Let's produce a nice error message.
    raise ScannerError("while scanning for the next token", None,
            "found character %r that cannot start any token" % ch,
            self.get_mark())

261 

262 # Simple keys treatment. 

263 

def next_possible_simple_key(self):
    """Return the smallest token number among the possible simple keys.

    Returns None when no possible simple key is recorded.
    """
    candidates = self.possible_simple_keys.values()
    return min((key.token_number for key in candidates), default=None)

278 

def stale_possible_simple_keys(self):
    """Drop recorded candidates that can no longer be simple keys.

    According to the YAML specification, a simple key is limited to a
    single line and to 1024 characters. A candidate that breaks either
    limit is removed; if it was required, a ScannerError is raised instead.
    Disabling this procedure would allow simple keys of any length and
    height (which may cause problems if indentation is broken).
    """
    # Iterate over a snapshot because entries are deleted along the way.
    for level, key in list(self.possible_simple_keys.items()):
        still_valid = key.line == self.line \
                and self.index-key.index <= 1024
        if still_valid:
            continue
        if key.required:
            raise ScannerError("while scanning a simple key", key.mark,
                    "could not find expected ':'", self.get_mark())
        del self.possible_simple_keys[level]

294 

def save_possible_simple_key(self):
    """Record the current position as a possible simple key start.

    Called before fetching ALIAS, ANCHOR, TAG, SCALAR(flow), '[' and '{'.
    Does nothing unless a simple key is allowed at this position.
    """
    # A simple key is required here in block context when the token sits
    # exactly at the current indentation level.
    required = not self.flow_level and self.indent == self.column

    if not self.allow_simple_key:
        return

    # Replace any candidate already stored for this flow level.
    self.remove_possible_simple_key()
    self.possible_simple_keys[self.flow_level] = SimpleKey(
            token_number=self.tokens_taken+len(self.tokens),
            required=required,
            index=self.index, line=self.line, column=self.column,
            mark=self.get_mark())

311 

def remove_possible_simple_key(self):
    """Forget the possible simple key saved at the current flow level.

    Raises ScannerError if that candidate was required.
    """
    level = self.flow_level
    if level not in self.possible_simple_keys:
        return
    key = self.possible_simple_keys[level]
    if key.required:
        raise ScannerError("while scanning a simple key", key.mark,
                "could not find expected ':'", self.get_mark())
    del self.possible_simple_keys[level]

322 

323 # Indentation functions. 

324 

def unwind_indent(self, column):
    """Close block collections that are indented deeper than `column`.

    Queues one BLOCK-END token for every indentation level popped.
    """
    # In the flow context, indentation is ignored. This makes the scanner
    # less restrictive than the specification requires: enforcing
    # indentation here would prohibit intuitively correct constructions
    # such as
    #   key : {
    #   }
    if self.flow_level:
        return

    # In block context, we may need to issue the BLOCK-END tokens.
    while column < self.indent:
        here = self.get_mark()
        self.indent = self.indents.pop()
        self.tokens.append(BlockEndToken(here, here))

348 

def add_indent(self, column):
    """Push a new indentation level if `column` is deeper than the current one.

    Returns True when the indentation was increased, False otherwise.
    """
    if column <= self.indent:
        return False
    self.indents.append(self.indent)
    self.indent = column
    return True

356 

357 # Fetchers. 

358 

def fetch_stream_start(self):
    """Queue the STREAM-START token.

    STREAM-START is always the first token and STREAM-END the last one.
    """
    here = self.get_mark()
    token = StreamStartToken(here, here, encoding=self.encoding)
    self.tokens.append(token)

369 

370 

def fetch_stream_end(self):
    """Close all open block collections, queue STREAM-END and mark the scanner done."""

    # Unwind every indentation level (the target level is -1).
    self.unwind_indent(-1)

    # Simple keys are over: drop the current candidate and all others.
    self.remove_possible_simple_key()
    self.allow_simple_key = False
    self.possible_simple_keys = {}

    # STREAM-END is a zero-width token at the current position.
    here = self.get_mark()
    self.tokens.append(StreamEndToken(here, here))

    # The stream is finished.
    self.done = True

389 

def fetch_directive(self):
    """Close open block collections, then scan and queue a DIRECTIVE token."""

    # Unwind every indentation level (the target level is -1).
    self.unwind_indent(-1)

    # No simple key may span a directive.
    self.remove_possible_simple_key()
    self.allow_simple_key = False

    token = self.scan_directive()
    self.tokens.append(token)

401 

def fetch_document_start(self):
    """Queue a DOCUMENT-START token for the '---' indicator."""
    self.fetch_document_indicator(DocumentStartToken)

404 

def fetch_document_end(self):
    """Queue a DOCUMENT-END token for the '...' indicator."""
    self.fetch_document_indicator(DocumentEndToken)

407 

def fetch_document_indicator(self, TokenClass):
    """Consume a three-character document indicator and queue its token.

    `TokenClass` is called with the start and end marks of the indicator.
    """

    # Unwind every indentation level (the target level is -1).
    self.unwind_indent(-1)

    # Reset simple keys. Note that there cannot be a block collection
    # right after '---'.
    self.remove_possible_simple_key()
    self.allow_simple_key = False

    # Add DOCUMENT-START or DOCUMENT-END.
    begin = self.get_mark()
    self.forward(3)
    token = TokenClass(begin, self.get_mark())
    self.tokens.append(token)

423 

def fetch_flow_sequence_start(self):
    """Queue a FLOW-SEQUENCE-START token for '['."""
    self.fetch_flow_collection_start(FlowSequenceStartToken)

426 

def fetch_flow_mapping_start(self):
    """Queue a FLOW-MAPPING-START token for '{'."""
    self.fetch_flow_collection_start(FlowMappingStartToken)

429 

def fetch_flow_collection_start(self, TokenClass):
    """Consume '[' or '{', enter one more flow level and queue the token.

    `TokenClass` is called with the start and end marks of the indicator.
    """

    # '[' and '{' may themselves start a simple key; record that before
    # the flow level changes.
    self.save_possible_simple_key()
    self.flow_level += 1

    # Simple keys are allowed after '[' and '{'.
    self.allow_simple_key = True

    # Add FLOW-SEQUENCE-START or FLOW-MAPPING-START.
    begin = self.get_mark()
    self.forward()
    token = TokenClass(begin, self.get_mark())
    self.tokens.append(token)

446 

def fetch_flow_sequence_end(self):
    """Queue a FLOW-SEQUENCE-END token for ']'."""
    self.fetch_flow_collection_end(FlowSequenceEndToken)

449 

def fetch_flow_mapping_end(self):
    """Queue a FLOW-MAPPING-END token for '}'."""
    self.fetch_flow_collection_end(FlowMappingEndToken)

452 

def fetch_flow_collection_end(self, TokenClass):
    """Consume ']' or '}', leave the current flow level and queue the token.

    `TokenClass` is called with the start and end marks of the indicator.
    """

    # Drop the candidate of the level being closed before leaving it.
    self.remove_possible_simple_key()
    self.flow_level -= 1

    # No simple keys after ']' or '}'.
    self.allow_simple_key = False

    # Add FLOW-SEQUENCE-END or FLOW-MAPPING-END.
    begin = self.get_mark()
    self.forward()
    token = TokenClass(begin, self.get_mark())
    self.tokens.append(token)

469 

def fetch_flow_entry(self):
    """Consume ',' and queue a FLOW-ENTRY token."""

    # Simple keys are allowed after ','.
    self.allow_simple_key = True

    # Whatever candidate preceded the comma is no longer a key.
    self.remove_possible_simple_key()

    begin = self.get_mark()
    self.forward()
    token = FlowEntryToken(begin, self.get_mark())
    self.tokens.append(token)

483 

def fetch_block_entry(self):
    """Consume '-' and queue BLOCK-ENTRY, opening a block sequence if needed.

    Raises ScannerError in block context when an entry is not allowed here.
    """

    # Only the block context needs additional checks. A block entry in
    # the flow context is an error too, but detecting it is left to the
    # parser.
    if not self.flow_level:

        # Are we allowed to start a new entry?
        if not self.allow_simple_key:
            raise ScannerError(None, None,
                    "sequence entries are not allowed here",
                    self.get_mark())

        # A deeper column opens a new sequence: BLOCK-SEQUENCE-START.
        if self.add_indent(self.column):
            here = self.get_mark()
            self.tokens.append(BlockSequenceStartToken(here, here))

    # Simple keys are allowed after '-'.
    self.allow_simple_key = True

    # Reset possible simple key on the current level.
    self.remove_possible_simple_key()

    # Add BLOCK-ENTRY.
    begin = self.get_mark()
    self.forward()
    token = BlockEntryToken(begin, self.get_mark())
    self.tokens.append(token)

516 

def fetch_key(self):
    """Consume '?' and queue KEY, opening a block mapping if needed.

    Raises ScannerError in block context when a key is not allowed here.
    """

    # Only the block context needs additional checks.
    if not self.flow_level:

        # Are we allowed to start a key (not necessarily a simple one)?
        if not self.allow_simple_key:
            raise ScannerError(None, None,
                    "mapping keys are not allowed here",
                    self.get_mark())

        # A deeper column opens a new mapping: BLOCK-MAPPING-START.
        if self.add_indent(self.column):
            here = self.get_mark()
            self.tokens.append(BlockMappingStartToken(here, here))

    # Simple keys are allowed after '?' in the block context only.
    self.allow_simple_key = not self.flow_level

    # Reset possible simple key on the current level.
    self.remove_possible_simple_key()

    # Add KEY.
    begin = self.get_mark()
    self.forward()
    token = KeyToken(begin, self.get_mark())
    self.tokens.append(token)

544 

def fetch_value(self):
    """Consume ':' and queue VALUE, resolving a pending simple key first.

    If a possible simple key is recorded for the current flow level, a KEY
    token (and, in block context, possibly BLOCK-MAPPING-START) is inserted
    into the queue at the position where that key started. Otherwise the
    ':' belongs to a complex key.

    Raises:
        ScannerError: in block context, when the ':' is part of a complex
            key and a value is not allowed at this position.
    """

    # Do we determine a simple key?
    if self.flow_level in self.possible_simple_keys:

        # Add KEY.
        key = self.possible_simple_keys[self.flow_level]
        del self.possible_simple_keys[self.flow_level]
        # `token_number` counts emitted tokens too, so subtract them to
        # get the index inside the pending queue.
        self.tokens.insert(key.token_number-self.tokens_taken,
                KeyToken(key.mark, key.mark))

        # If this key starts a new block mapping, we need to add
        # BLOCK-MAPPING-START.
        if not self.flow_level:
            if self.add_indent(key.column):
                # Inserted at the same index, so it ends up before KEY.
                self.tokens.insert(key.token_number-self.tokens_taken,
                        BlockMappingStartToken(key.mark, key.mark))

        # There cannot be two simple keys one after another.
        self.allow_simple_key = False

    # It must be a part of a complex key.
    else:

        # Block context needs additional checks.
        # (Do we really need them? They will be caught by the parser
        # anyway.)
        if not self.flow_level:

            # We are allowed to start a complex value if and only if
            # we can start a simple key.
            if not self.allow_simple_key:
                raise ScannerError(None, None,
                        "mapping values are not allowed here",
                        self.get_mark())

        # If this value starts a new block mapping, we need to add
        # BLOCK-MAPPING-START.  It will be detected as an error later by
        # the parser.
        if not self.flow_level:
            if self.add_indent(self.column):
                mark = self.get_mark()
                self.tokens.append(BlockMappingStartToken(mark, mark))

        # Simple keys are allowed after ':' in the block context.
        self.allow_simple_key = not self.flow_level

        # Reset possible simple key on the current level.
        self.remove_possible_simple_key()

    # Add VALUE.
    start_mark = self.get_mark()
    self.forward()
    end_mark = self.get_mark()
    self.tokens.append(ValueToken(start_mark, end_mark))

600 

def fetch_alias(self):
    """Scan an alias ('*name') and queue an ALIAS token."""

    # An alias may itself be a simple key, but none may follow it.
    self.save_possible_simple_key()
    self.allow_simple_key = False

    token = self.scan_anchor(AliasToken)
    self.tokens.append(token)

611 

def fetch_anchor(self):
    """Scan an anchor ('&name') and queue an ANCHOR token."""

    # An anchor may start a simple key, but none may follow it directly.
    self.save_possible_simple_key()
    self.allow_simple_key = False

    token = self.scan_anchor(AnchorToken)
    self.tokens.append(token)

622 

def fetch_tag(self):
    """Scan a tag and queue a TAG token."""

    # A tag may start a simple key, but none may follow it directly.
    self.save_possible_simple_key()
    self.allow_simple_key = False

    token = self.scan_tag()
    self.tokens.append(token)

633 

def fetch_literal(self):
    """Fetch a block scalar in literal ('|') style."""
    self.fetch_block_scalar(style='|')

636 

def fetch_folded(self):
    """Fetch a block scalar in folded ('>') style."""
    self.fetch_block_scalar(style='>')

639 

def fetch_block_scalar(self, style):
    """Scan a block scalar of the given `style` and queue a SCALAR token."""

    # A simple key may follow a block scalar.
    self.allow_simple_key = True

    # A block scalar cannot be a simple key itself, so drop the candidate
    # on the current level.
    self.remove_possible_simple_key()

    token = self.scan_block_scalar(style)
    self.tokens.append(token)

650 

def fetch_single(self):
    """Fetch a single-quoted flow scalar."""
    self.fetch_flow_scalar(style='\'')

653 

def fetch_double(self):
    """Fetch a double-quoted flow scalar."""
    self.fetch_flow_scalar(style='"')

656 

def fetch_flow_scalar(self, style):
    """Scan a quoted scalar of the given `style` and queue a SCALAR token."""

    # A flow scalar may be a simple key, but none may follow it directly.
    self.save_possible_simple_key()
    self.allow_simple_key = False

    token = self.scan_flow_scalar(style)
    self.tokens.append(token)

667 

def fetch_plain(self):
    """Scan a plain scalar and queue a SCALAR token."""

    # A plain scalar may be a simple key.
    self.save_possible_simple_key()

    # No simple keys after plain scalars. `scan_plain` will change this
    # flag again if the scan finishes at the beginning of a line, so it
    # must be cleared before the scan, not after.
    self.allow_simple_key = False

    token = self.scan_plain()
    self.tokens.append(token)

680 

681 # Checkers. 

682 

def check_directive(self):
    """Tell whether the '%' at the current position starts a directive.

    DIRECTIVE:      ^ '%' ...
    The '%' indicator itself has already been checked by the caller; a
    directive is recognized only at the very start of a line.

    Returns:
        True if the current column is 0, False otherwise (always a bool,
        like the other `check_*` methods).
    """
    return self.column == 0

689 

def check_document_start(self):
    """Tell whether a document start indicator begins at the current position.

    DOCUMENT-START: ^ '---' (' '|'\\n')

    Returns:
        True if the current column is 0, the next three characters are
        '---' and they are followed by whitespace, a line break or the end
        of the stream; False otherwise (always a bool).
    """
    if self.column != 0:
        return False
    return self.prefix(3) == '---' \
            and self.peek(3) in '\0 \t\r\n\x85\u2028\u2029'

697 

def check_document_end(self):
    """Tell whether a document end indicator begins at the current position.

    DOCUMENT-END:   ^ '...' (' '|'\\n')

    Returns:
        True if the current column is 0, the next three characters are
        '...' and they are followed by whitespace, a line break or the end
        of the stream; False otherwise (always a bool).
    """
    if self.column != 0:
        return False
    return self.prefix(3) == '...' \
            and self.peek(3) in '\0 \t\r\n\x85\u2028\u2029'

705 

def check_block_entry(self):
    """Tell whether the '-' at the current position is a block entry.

    BLOCK-ENTRY:    '-' (' '|'\\n')
    """
    follower = self.peek(1)
    return follower in '\0 \t\r\n\x85\u2028\u2029'

710 

def check_key(self):
    """Tell whether the '?' at the current position is a key indicator.

    KEY(flow context):      '?'
    KEY(block context):     '?' (' '|'\\n')
    """
    if self.flow_level:
        return True
    return self.peek(1) in '\0 \t\r\n\x85\u2028\u2029'

720 

def check_value(self):
    """Tell whether the ':' at the current position is a value indicator.

    VALUE(flow context):    ':'
    VALUE(block context):   ':' (' '|'\\n')
    """
    if self.flow_level:
        return True
    return self.peek(1) in '\0 \t\r\n\x85\u2028\u2029'

730 

def check_plain(self):
    """Tell whether a plain scalar may start at the current position.

    A plain scalar may start with any non-space character except:
      '-', '?', ':', ',', '[', ']', '{', '}',
      '#', '&', '*', '!', '|', '>', '\\'', '\\"',
      '%', '@', '`'.

    It may also start with '-', '?' or ':' if that character is followed
    by a non-space character. For '?' and ':' this rule is limited to the
    block context, because the flow context should be space independent.
    """
    ch = self.peek()
    if ch not in '\0 \t\r\n\x85\u2028\u2029-?:,[]{}#&*!|>\'\"%@`':
        return True
    # An indicator followed by a space or break cannot start a scalar.
    if self.peek(1) in '\0 \t\r\n\x85\u2028\u2029':
        return False
    return ch == '-' or (not self.flow_level and ch in '?:')

749 

750 # Scanners. 

751 

def scan_to_next_token(self):
    """Skip spaces, line breaks and comments up to the next token.

    A line break found in the block context switches `allow_simple_key`
    on. A byte order mark is stripped only when it is the first character
    of the stream; BOMs inside the stream are not supported yet and are
    treated as part of the document.
    """
    # TODO: We need to make tab handling rules more sane. A good rule is
    #   Tabs cannot precede tokens
    #   BLOCK-SEQUENCE-START, BLOCK-MAPPING-START, BLOCK-END,
    #   KEY(block), VALUE(block), BLOCK-ENTRY
    # So the checking code is
    #   if <TAB>:
    #       self.allow_simple_keys = False
    # We also need to add the check for `allow_simple_keys == True` to
    # `unwind_indent` before issuing BLOCK-END.
    # Scanners for block, flow, and plain scalars need to be modified.

    if self.index == 0 and self.peek() == '\uFEFF':
        self.forward()
    while True:
        # Leading spaces.
        while self.peek() == ' ':
            self.forward()
        # A comment runs to the end of the line.
        if self.peek() == '#':
            while self.peek() not in '\0\r\n\x85\u2028\u2029':
                self.forward()
        # Without a line break here, the next token has been reached.
        if not self.scan_line_break():
            break
        if not self.flow_level:
            self.allow_simple_key = True

def scan_directive(self):
    """Scan a '%' directive line and return its DirectiveToken.

    The value is parsed for the YAML and TAG directives; for any other
    name the value is None and the rest of the line is skipped. See the
    specification for details.
    """
    start_mark = self.get_mark()
    self.forward()
    name = self.scan_directive_name(start_mark)
    if name == 'YAML':
        value = self.scan_yaml_directive_value(start_mark)
    elif name == 'TAG':
        value = self.scan_tag_directive_value(start_mark)
    else:
        value = None
    # The token ends after the parsed value, or right after the name of
    # an unknown directive.
    end_mark = self.get_mark()
    if name not in ('YAML', 'TAG'):
        while self.peek() not in '\0\r\n\x85\u2028\u2029':
            self.forward()
    self.scan_directive_ignored_line(start_mark)
    return DirectiveToken(name, value, start_mark, end_mark)

805 

def scan_directive_name(self, start_mark):
    """Scan and return the name of a directive.

    The name consists of ASCII letters, digits, '-' and '_', and must be
    followed by a space, a line break or the end of the stream; otherwise
    a ScannerError is raised. See the specification for details.
    """
    length = 0
    while True:
        ch = self.peek(length)
        is_name_char = '0' <= ch <= '9' or 'A' <= ch <= 'Z' \
                or 'a' <= ch <= 'z' or ch in '-_'
        if not is_name_char:
            break
        length += 1
    if not length:
        raise ScannerError("while scanning a directive", start_mark,
                "expected alphabetic or numeric character, but found %r"
                % ch, self.get_mark())
    name = self.prefix(length)
    self.forward(length)
    ch = self.peek()
    if ch not in '\0 \r\n\x85\u2028\u2029':
        raise ScannerError("while scanning a directive", start_mark,
                "expected alphabetic or numeric character, but found %r"
                % ch, self.get_mark())
    return name

826 

def scan_yaml_directive_value(self, start_mark):
    """Scan the 'major.minor' version of a %YAML directive.

    Returns the tuple (major, minor). Raises ScannerError if the '.' is
    missing or the version is followed by an unexpected character. See the
    specification for details.
    """
    while self.peek() == ' ':
        self.forward()
    major = self.scan_yaml_directive_number(start_mark)
    if self.peek() != '.':
        raise ScannerError("while scanning a directive", start_mark,
                "expected a digit or '.', but found %r" % self.peek(),
                self.get_mark())
    self.forward()
    minor = self.scan_yaml_directive_number(start_mark)
    version_ended = self.peek() in '\0 \r\n\x85\u2028\u2029'
    if not version_ended:
        raise ScannerError("while scanning a directive", start_mark,
                "expected a digit or ' ', but found %r" % self.peek(),
                self.get_mark())
    return (major, minor)

843 

def scan_yaml_directive_number(self, start_mark):
    """Scan a run of decimal digits and return it as an int.

    Raises ScannerError if the current character is not a digit. See the
    specification for details.
    """
    first = self.peek()
    if first < '0' or first > '9':
        raise ScannerError("while scanning a directive", start_mark,
                "expected a digit, but found %r" % first, self.get_mark())
    width = 0
    while '0' <= self.peek(width) <= '9':
        width += 1
    digits = self.prefix(width)
    self.forward(width)
    return int(digits)

856 

def scan_tag_directive_value(self, start_mark):
    """Scan the handle and prefix of a %TAG directive.

    Returns the tuple (handle, prefix); spaces before each part are
    skipped. See the specification for details.
    """
    parts = []
    for scan_part in (self.scan_tag_directive_handle,
            self.scan_tag_directive_prefix):
        while self.peek() == ' ':
            self.forward()
        parts.append(scan_part(start_mark))
    return (parts[0], parts[1])

866 

def scan_tag_directive_handle(self, start_mark):
    """Scan the handle of a %TAG directive; it must be followed by a space.

    Raises ScannerError otherwise. See the specification for details.
    """
    handle = self.scan_tag_handle('directive', start_mark)
    following = self.peek()
    if following != ' ':
        raise ScannerError("while scanning a directive", start_mark,
                "expected ' ', but found %r" % following, self.get_mark())
    return handle

875 

def scan_tag_directive_prefix(self, start_mark):
    """Scan the prefix URI of a %TAG directive.

    The prefix must be followed by a space, a line break or the end of the
    stream; otherwise ScannerError is raised. See the specification for
    details.
    """
    prefix = self.scan_tag_uri('directive', start_mark)
    following = self.peek()
    if following not in '\0 \r\n\x85\u2028\u2029':
        raise ScannerError("while scanning a directive", start_mark,
                "expected ' ', but found %r" % following, self.get_mark())
    return prefix

884 

def scan_directive_ignored_line(self, start_mark):
    """Consume the rest of a directive line: spaces, a comment, the break.

    Raises ScannerError if anything else is found before the line break.
    See the specification for details.
    """
    while self.peek() == ' ':
        self.forward()
    if self.peek() == '#':
        while self.peek() not in '\0\r\n\x85\u2028\u2029':
            self.forward()
    leftover = self.peek()
    if leftover not in '\0\r\n\x85\u2028\u2029':
        raise ScannerError("while scanning a directive", start_mark,
                "expected a comment or a line break, but found %r"
                % leftover, self.get_mark())
    self.scan_line_break()

898 

def scan_anchor(self, TokenClass):
    """Scan an alias ('*name') or anchor ('&name') and return its token.

    `TokenClass` is called with the name and the start and end marks.
    Raises ScannerError if the name is empty or is followed by an
    unexpected character.
    """
    # The specification does not restrict characters for anchors and
    # aliases. This may lead to problems, for instance, the document:
    #   [ *alias, value ]
    # can be interpreted in two ways, as
    #   [ "value" ]
    # and
    #   [ *alias , "value" ]
    # Therefore names are restricted to ASCII letters, digits, '-' and '_'.
    start_mark = self.get_mark()
    name = 'alias' if self.peek() == '*' else 'anchor'
    self.forward()
    length = 0
    while True:
        ch = self.peek(length)
        is_name_char = '0' <= ch <= '9' or 'A' <= ch <= 'Z' \
                or 'a' <= ch <= 'z' or ch in '-_'
        if not is_name_char:
            break
        length += 1
    if not length:
        raise ScannerError("while scanning an %s" % name, start_mark,
                "expected alphabetic or numeric character, but found %r"
                % ch, self.get_mark())
    value = self.prefix(length)
    self.forward(length)
    ch = self.peek()
    if ch not in '\0 \t\r\n\x85\u2028\u2029?:,]}%@`':
        raise ScannerError("while scanning an %s" % name, start_mark,
                "expected alphabetic or numeric character, but found %r"
                % ch, self.get_mark())
    return TokenClass(value, start_mark, self.get_mark())

934 

def scan_tag(self):
    """Scan a tag and return a TagToken whose value is (handle, suffix).

    Three forms are recognized: a verbatim tag '!<uri>' (handle None), a
    lone '!' (handle None, suffix '!'), and a tag with a handle. Raises
    ScannerError on a missing '>' or an unexpected character after the
    tag. See the specification for details.
    """
    start_mark = self.get_mark()
    ch = self.peek(1)
    if ch == '<':
        # Verbatim tag: skip '!<', read the URI, require the closing '>'.
        handle = None
        self.forward(2)
        suffix = self.scan_tag_uri('tag', start_mark)
        if self.peek() != '>':
            raise ScannerError("while parsing a tag", start_mark,
                    "expected '>', but found %r" % self.peek(),
                    self.get_mark())
        self.forward()
    elif ch in '\0 \t\r\n\x85\u2028\u2029':
        # A lone '!'.
        handle = None
        suffix = '!'
        self.forward()
    else:
        # Look ahead for a second '!' before the tag ends; if there is
        # one, the tag carries a named handle.
        offset = 1
        has_handle = False
        while ch not in '\0 \r\n\x85\u2028\u2029':
            if ch == '!':
                has_handle = True
                break
            offset += 1
            ch = self.peek(offset)
        if has_handle:
            handle = self.scan_tag_handle('tag', start_mark)
        else:
            handle = '!'
            self.forward()
        suffix = self.scan_tag_uri('tag', start_mark)
    ch = self.peek()
    if ch not in '\0 \r\n\x85\u2028\u2029':
        raise ScannerError("while scanning a tag", start_mark,
                "expected ' ', but found %r" % ch, self.get_mark())
    return TagToken((handle, suffix), start_mark, self.get_mark())

975 

def scan_block_scalar(self, style):
    """Scan a block scalar and return it as a non-plain ``ScalarToken``.

    ``style`` is passed through to the token; the value ``'>'`` switches
    line folding on.  The header (indicators plus the ignored rest of the
    line) is read first, then the content indentation is fixed -- either
    from the explicit increment or from the leading blank lines -- and
    finally every line that starts exactly at that indentation is
    collected.
    """
    folded = style == '>'
    start_mark = self.get_mark()
    pieces = []

    # Header: step over the current (indicator) character, then read the
    # chomping/indentation indicators and the remainder of the line.
    self.forward()
    chomping, increment = self.scan_block_scalar_indicators(start_mark)
    self.scan_block_scalar_ignored_line(start_mark)

    # Content indentation; never less than one column.
    min_indent = max(self.indent + 1, 1)
    if increment is not None:
        indent = min_indent + increment - 1
        breaks, end_mark = self.scan_block_scalar_breaks(indent)
    else:
        breaks, max_indent, end_mark = self.scan_block_scalar_indentation()
        indent = max(min_indent, max_indent)
    line_break = ''

    # Body: one iteration per content line.
    while self.column == indent and self.peek() != '\0':
        pieces.extend(breaks)
        starts_with_text = self.peek() not in ' \t'
        width = 0
        while self.peek(width) not in '\0\r\n\x85\u2028\u2029':
            width += 1
        pieces.append(self.prefix(width))
        self.forward(width)
        line_break = self.scan_line_break()
        breaks, end_mark = self.scan_block_scalar_breaks(indent)
        if not (self.column == indent and self.peek() != '\0'):
            break

        # Folding as the specification words it: a single '\n' between
        # two lines that both start with a non-blank character turns into
        # one space (or vanishes when blank lines follow); every other
        # break is kept.  The spec examples suggest a slightly different
        # reading, which is deliberately not implemented here.
        foldable = (folded and line_break == '\n'
                    and starts_with_text and self.peek() not in ' \t')
        if not foldable:
            pieces.append(line_break)
        elif not breaks:
            pieces.append(' ')

    # Tail: chomping False drops the final break, None keeps only that
    # break, True also keeps the trailing blank lines.
    if chomping is not False:
        pieces.append(line_break)
    if chomping is True:
        pieces.extend(breaks)

    return ScalarToken(''.join(pieces), False, start_mark, end_mark,
            style)

1051 

def scan_block_scalar_indicators(self, start_mark):
    """Read the optional chomping and indentation indicators.

    The two indicators may come in either order.  Returns
    ``(chomping, increment)`` where ``chomping`` is ``True`` for '+',
    ``False`` for '-' and ``None`` when absent, and ``increment`` is the
    digit 1-9 as an int or ``None`` when absent.

    Raises ``ScannerError`` for a '0' indentation indicator, or when the
    indicators are not followed by a space, a line break or NUL.
    """
    chomping = None
    increment = None

    ch = self.peek()
    chomping_first = ch in '+-'
    if chomping_first:
        chomping = ch == '+'
        self.forward()
        ch = self.peek()

    if ch in '0123456789':
        increment = int(ch)
        if increment == 0:
            raise ScannerError("while scanning a block scalar", start_mark,
                    "expected indentation indicator in the range 1-9, but found 0",
                    self.get_mark())
        self.forward()
        # A chomping indicator may still follow the digit, but only when
        # none was given in front of it.
        if not chomping_first:
            ch = self.peek()
            if ch in '+-':
                chomping = ch == '+'
                self.forward()

    ch = self.peek()
    if ch not in '\0 \r\n\x85\u2028\u2029':
        raise ScannerError("while scanning a block scalar", start_mark,
                "expected chomping or indentation indicators, but found %r"
                % ch, self.get_mark())
    return chomping, increment

1091 

def scan_block_scalar_ignored_line(self, start_mark):
    """Consume the remainder of a block scalar header line.

    Skips spaces, then an optional '#' comment, then one line break.
    Raises ``ScannerError`` when anything other than a comment, a line
    break or NUL is found after the spaces.
    """
    line_end = '\0\r\n\x85\u2028\u2029'

    while self.peek() == ' ':
        self.forward()
    if self.peek() == '#':
        # Comment: swallow everything up to the end of the line.
        while self.peek() not in line_end:
            self.forward()

    found = self.peek()
    if found not in line_end:
        raise ScannerError("while scanning a block scalar", start_mark,
                "expected a comment or a line break, but found %r" % found,
                self.get_mark())
    self.scan_line_break()

1105 

def scan_block_scalar_indentation(self):
    """Skip leading spaces and line breaks, measuring the indentation.

    Returns ``(breaks, max_indent, end_mark)``: the normalised line breaks
    that were consumed, the largest column reached while skipping spaces,
    and the mark taken after the last consumed line break (or at entry
    when there was none).
    """
    leading_breaks = []
    widest = 0
    end_mark = self.get_mark()
    while self.peek() in ' \r\n\x85\u2028\u2029':
        if self.peek() == ' ':
            self.forward()
            widest = max(widest, self.column)
        else:
            leading_breaks.append(self.scan_line_break())
            end_mark = self.get_mark()
    return leading_breaks, widest, end_mark

1120 

def scan_block_scalar_breaks(self, indent):
    """Consume blank lines, skipping at most ``indent`` columns of spaces.

    Returns ``(breaks, end_mark)``: the normalised line breaks that were
    consumed and the mark taken after the last of them (or at entry when
    there was none).
    """
    collected = []
    end_mark = self.get_mark()
    while True:
        # Indentation spaces, but never beyond the content column.
        while self.column < indent and self.peek() == ' ':
            self.forward()
        if self.peek() not in '\r\n\x85\u2028\u2029':
            return collected, end_mark
        collected.append(self.scan_line_break())
        end_mark = self.get_mark()

1133 

def scan_flow_scalar(self, style):
    """Scan a quoted scalar and return it as a non-plain ``ScalarToken``.

    ``style`` is passed through to the token; the value '"' selects
    double-quoted rules (escape sequences), anything else single-quoted
    rules.  The character at the current position is taken as the quote
    and scanning continues until it is seen again.

    Indentation is intentionally not enforced for quoted scalars: the
    quotes delimit them unambiguously, so this is more permissive than
    the specification.  Only document separators are rejected (by the
    helpers this method calls).
    """
    double = style == '"'
    start_mark = self.get_mark()
    quote = self.peek()
    self.forward()

    # Content alternates between non-blank runs and blank runs.
    pieces = list(self.scan_flow_scalar_non_spaces(double, start_mark))
    while self.peek() != quote:
        pieces += self.scan_flow_scalar_spaces(double, start_mark)
        pieces += self.scan_flow_scalar_non_spaces(double, start_mark)

    self.forward()
    return ScalarToken(''.join(pieces), False, start_mark, self.get_mark(),
            style)

1157 

# Single-character escapes: the character following the backslash mapped
# to the text it stands for.
ESCAPE_REPLACEMENTS = {
    '0': '\0',          # NUL
    'a': '\x07',        # bell
    'b': '\x08',        # backspace
    't': '\x09',        # horizontal tab
    '\t': '\x09',       # backslash + literal tab is a tab as well
    'n': '\x0A',        # line feed
    'v': '\x0B',        # vertical tab
    'f': '\x0C',        # form feed
    'r': '\x0D',        # carriage return
    'e': '\x1B',        # escape
    ' ': '\x20',        # space
    '"': '"',           # double quote
    '\\': '\\',         # backslash
    '/': '/',           # slash
    'N': '\x85',        # next line (U+0085)
    '_': '\xA0',        # no-break space (U+00A0)
    'L': '\u2028',      # line separator
    'P': '\u2029',      # paragraph separator
}

# Numeric escapes: the introducing character mapped to the number of
# hexadecimal digits that must follow it.
ESCAPE_CODES = {'x': 2, 'u': 4, 'U': 8}

1184 

def scan_flow_scalar_non_spaces(self, double, start_mark):
    """Scan a run of non-blank content inside a quoted scalar.

    ``double`` selects double-quoted rules (backslash escapes) over
    single-quoted rules (``''`` stands for one quote).  ``start_mark`` is
    only used for error reporting.

    Returns the list of string chunks read.  Scanning stops, without
    consuming it, at the first character this method does not handle: a
    blank, a line break, NUL, or the quote of the current style.

    Raises ``ScannerError`` for an unknown escape character, for a numeric
    escape with too few hexadecimal digits, and for a numeric escape whose
    value lies beyond the last Unicode code point (0x10FFFF).
    """
    chunks = []
    while True:
        # Copy verbatim everything up to the next quote, backslash,
        # blank, line break or NUL.
        length = 0
        while self.peek(length) not in '\'\"\\\0 \t\r\n\x85\u2028\u2029':
            length += 1
        if length:
            chunks.append(self.prefix(length))
            self.forward(length)
        ch = self.peek()
        if not double and ch == '\'' and self.peek(1) == '\'':
            # Doubled single quote inside a single-quoted scalar.
            chunks.append('\'')
            self.forward(2)
        elif (double and ch == '\'') or (not double and ch in '\"\\'):
            # The other style's quote, or a backslash in a single-quoted
            # scalar, is ordinary content.
            chunks.append(ch)
            self.forward()
        elif double and ch == '\\':
            self.forward()
            ch = self.peek()
            if ch in self.ESCAPE_REPLACEMENTS:
                chunks.append(self.ESCAPE_REPLACEMENTS[ch])
                self.forward()
            elif ch in self.ESCAPE_CODES:
                length = self.ESCAPE_CODES[ch]
                self.forward()
                for k in range(length):
                    if self.peek(k) not in '0123456789ABCDEFabcdef':
                        raise ScannerError("while scanning a double-quoted scalar", start_mark,
                                "expected escape sequence of %d hexadecimal numbers, but found %r" %
                                    (length, self.peek(k)), self.get_mark())
                code = int(self.prefix(length), 16)
                # Eight hex digits can encode values chr() rejects with
                # ValueError/OverflowError; report them as a scanner
                # error like every other malformed escape.
                if code > 0x10FFFF:
                    raise ScannerError("while scanning a double-quoted scalar", start_mark,
                            "expected a Unicode code point no greater than 0x10FFFF, but found 0x%X"
                            % code, self.get_mark())
                chunks.append(chr(code))
                self.forward(length)
            elif ch in '\r\n\x85\u2028\u2029':
                # Escaped line break: the break itself is dropped, only
                # what scan_flow_scalar_breaks returns is kept.
                self.scan_line_break()
                chunks.extend(self.scan_flow_scalar_breaks(double, start_mark))
            else:
                raise ScannerError("while scanning a double-quoted scalar", start_mark,
                        "found unknown escape character %r" % ch, self.get_mark())
        else:
            return chunks

1227 

def scan_flow_scalar_spaces(self, double, start_mark):
    """Scan a run of blanks (and any line folding) inside a quoted scalar.

    Returns a list of chunks.  Spaces and tabs that are not followed by a
    line break are returned unchanged (possibly as one empty string).
    When a line break follows, the blanks before it are discarded: a lone
    '\\n' folds into a single space, a '\\n' followed by further breaks
    yields just those breaks, and any other kind of break is kept.

    Raises ``ScannerError`` when the stream ends inside the scalar.
    """
    width = 0
    while self.peek(width) in ' \t':
        width += 1
    whitespaces = self.prefix(width)
    self.forward(width)

    upcoming = self.peek()
    if upcoming == '\0':
        raise ScannerError("while scanning a quoted scalar", start_mark,
                "found unexpected end of stream", self.get_mark())
    if upcoming not in '\r\n\x85\u2028\u2029':
        return [whitespaces]

    line_break = self.scan_line_break()
    breaks = self.scan_flow_scalar_breaks(double, start_mark)
    if line_break != '\n':
        folded = [line_break]
    elif not breaks:
        folded = [' ']
    else:
        folded = []
    folded.extend(breaks)
    return folded

1251 

def scan_flow_scalar_breaks(self, double, start_mark):
    """Consume blank lines inside a quoted scalar.

    Returns the list of normalised line breaks that were consumed;
    leading spaces and tabs on each line are skipped.

    Indentation is not checked here.  Instead, a document separator
    ('---' or '...' followed by a blank, a line break or NUL) found at the
    position where a line is examined raises ``ScannerError``.
    """
    collected = []
    while True:
        if self.prefix(3) in ('---', '...') \
                and self.peek(3) in '\0 \t\r\n\x85\u2028\u2029':
            raise ScannerError("while scanning a quoted scalar", start_mark,
                    "found unexpected document separator", self.get_mark())
        while self.peek() in ' \t':
            self.forward()
        if self.peek() not in '\r\n\x85\u2028\u2029':
            return collected
        collected.append(self.scan_line_break())

1269 

def scan_plain(self):
    """Scan a plain (unquoted) scalar and return it as a ``ScalarToken``.

    On top of the rules in the specification, a plain scalar inside a flow
    collection may not contain ',' or '?'.  ``allow_simple_key`` is
    cleared as soon as any content is consumed.  Indentation is only
    enforced in block context.  Zero indentation is allowed, which is why
    document separators at the start of a line are checked by
    ``scan_plain_spaces``.
    """
    blanks = '\0 \t\r\n\x85\u2028\u2029'
    # flow_level is not modified by anything this method calls, so the
    # flow-dependent terminator set is built once.
    in_flow = bool(self.flow_level)
    after_colon = blanks + (',[]{}' if in_flow else '')

    start_mark = self.get_mark()
    end_mark = start_mark
    indent = self.indent + 1
    pieces = []
    spaces = []
    # A '#' at the start of a word begins a comment and ends the scalar.
    while self.peek() != '#':
        # Measure the next run of scalar characters.
        width = 0
        while True:
            ch = self.peek(width)
            if ch in blanks:
                break
            if ch == ':' and self.peek(width + 1) in after_colon:
                break
            if in_flow and ch in ',?[]{}':
                break
            width += 1
        if width == 0:
            break

        self.allow_simple_key = False
        # Separators found after the previous word only become part of
        # the value once another word actually follows them.
        pieces.extend(spaces)
        pieces.append(self.prefix(width))
        self.forward(width)
        end_mark = self.get_mark()
        spaces = self.scan_plain_spaces(indent, start_mark)
        if not spaces or (not in_flow and self.column < indent):
            break
    return ScalarToken(''.join(pieces), True, start_mark, end_mark)

1310 

def scan_plain_spaces(self, indent, start_mark):
    """Scan the separation that follows a word of a plain scalar.

    Only spaces count as blanks here; tabs are not accepted at all.

    Returns a list of chunks: the spaces themselves when no line break
    follows (an empty list when there were none), otherwise the folded
    form of the line breaks.  Returns ``None`` when a document separator
    ('---' or '...' followed by a blank, a line break or NUL) starts one
    of the following lines.  ``allow_simple_key`` is set once a line break
    has been consumed.  ``indent`` and ``start_mark`` are not used.
    """
    line_breaks = '\r\n\x85\u2028\u2029'

    def at_document_separator():
        return (self.prefix(3) in ('---', '...')
                and self.peek(3) in '\0 \t\r\n\x85\u2028\u2029')

    width = 0
    while self.peek(width) == ' ':
        width += 1
    whitespaces = self.prefix(width)
    self.forward(width)

    if self.peek() not in line_breaks:
        return [whitespaces] if whitespaces else []

    first_break = self.scan_line_break()
    self.allow_simple_key = True
    if at_document_separator():
        return None

    extra_breaks = []
    while self.peek() in ' ' + line_breaks:
        if self.peek() == ' ':
            self.forward()
            continue
        extra_breaks.append(self.scan_line_break())
        if at_document_separator():
            return None

    # A lone '\n' folds into a space; a '\n' followed by blank lines
    # yields only those lines; any other break is kept as is.
    if first_break != '\n':
        return [first_break] + extra_breaks
    if not extra_breaks:
        return [' ']
    return extra_breaks

1347 

def scan_tag_handle(self, name, start_mark):
    """Scan a tag handle and return it, delimiters included.

    Accepts a lone '!' when a space follows it; otherwise '!' followed by
    any number of ASCII letters, digits, '-' or '_' and a closing '!'.
    The underscore is accepted on purpose even though the specification
    does not list it.  ``name`` only appears in error messages.

    Raises ``ScannerError`` when the handle does not start with '!' or is
    not closed by '!'.
    """
    first = self.peek()
    if first != '!':
        raise ScannerError("while scanning a %s" % name, start_mark,
                "expected '!', but found %r" % first, self.get_mark())

    word_chars = ('0123456789'
                  'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                  'abcdefghijklmnopqrstuvwxyz'
                  '-_')
    size = 1
    upcoming = self.peek(size)
    if upcoming != ' ':
        while upcoming in word_chars:
            size += 1
            upcoming = self.peek(size)
        if upcoming != '!':
            # Move onto the offending character so the error mark
            # points at it.
            self.forward(size)
            raise ScannerError("while scanning a %s" % name, start_mark,
                    "expected '!', but found %r" % upcoming, self.get_mark())
        size += 1
    handle = self.prefix(size)
    self.forward(size)
    return handle

1371 

def scan_tag_uri(self, name, start_mark):
    """Scan the URI part of a tag and return it as a string.

    Consumes ASCII letters, digits and the punctuation listed below;
    '%XX' sequences are decoded through ``scan_uri_escapes``.  The result
    is not checked for being a well-formed URI.  ``name`` only appears in
    error messages.

    Raises ``ScannerError`` when no URI character is found at all.
    """
    uri_chars = ('0123456789'
                 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                 'abcdefghijklmnopqrstuvwxyz'
                 '-;/?:@&=+$,_.!~*\'()[]%')
    pieces = []
    pending = 0     # characters looked at but not consumed yet
    ch = self.peek(pending)
    while ch in uri_chars:
        if ch != '%':
            pending += 1
        else:
            # Flush the literal text before the escape, then decode it.
            pieces.append(self.prefix(pending))
            self.forward(pending)
            pending = 0
            pieces.append(self.scan_uri_escapes(name, start_mark))
        ch = self.peek(pending)
    if pending:
        pieces.append(self.prefix(pending))
        self.forward(pending)
    if not pieces:
        raise ScannerError("while parsing a %s" % name, start_mark,
                "expected URI, but found %r" % ch, self.get_mark())
    return ''.join(pieces)

1396 

def scan_uri_escapes(self, name, start_mark):
    """Decode a run of consecutive '%XX' URI escapes.

    The escaped bytes are collected and decoded together as UTF-8, so a
    multi-byte character may span several escapes.  Returns the decoded
    text (empty when the current character is not '%').  ``name`` only
    appears in error messages.

    Raises ``ScannerError`` when an escape is not made of two hexadecimal
    digits or when the bytes are not valid UTF-8.
    """
    hex_digits = '0123456789ABCDEFabcdef'
    mark = self.get_mark()
    octets = []
    while self.peek() == '%':
        self.forward()
        for offset in (0, 1):
            digit = self.peek(offset)
            if digit not in hex_digits:
                raise ScannerError("while scanning a %s" % name, start_mark,
                        "expected URI escape sequence of 2 hexadecimal numbers, but found %r"
                        % digit, self.get_mark())
        octets.append(int(self.prefix(2), 16))
        self.forward(2)
    try:
        return bytes(octets).decode('utf-8')
    except UnicodeDecodeError as exc:
        raise ScannerError("while scanning a %s" % name, start_mark, str(exc), mark)

1415 

def scan_line_break(self):
    # Consume one line break at the current position and return its
    # normalised form:
    #   '\r\n'   -> '\n'
    #   '\r'     -> '\n'
    #   '\n'     -> '\n'
    #   '\x85'   -> '\n'
    #   '\u2028' -> '\u2028'
    #   '\u2029' -> '\u2029'
    # Anything else is left untouched and '' is returned.
    ch = self.peek()
    if ch in '\u2028\u2029':
        self.forward()
        return ch
    if ch in '\r\n\x85':
        # A CR LF pair counts as a single break.
        if self.prefix(2) == '\r\n':
            self.forward(2)
        else:
            self.forward()
        return '\n'
    return ''