Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/yaml/scanner.py: 11%

# Scanner produces tokens of the following types:
# STREAM-START
# STREAM-END
# DIRECTIVE(name, value)
# DOCUMENT-START
# DOCUMENT-END
# BLOCK-SEQUENCE-START
# BLOCK-MAPPING-START
# BLOCK-END
# FLOW-SEQUENCE-START
# FLOW-MAPPING-START
# FLOW-SEQUENCE-END
# FLOW-MAPPING-END
# BLOCK-ENTRY
# FLOW-ENTRY
# KEY
# VALUE
# ALIAS(value)
# ANCHOR(value)
# TAG(value)
# SCALAR(value, plain, style)
#
# Read comments in the Scanner code for more details.
#

27__all__ = ['Scanner', 'ScannerError']

29from .error import MarkedYAMLError

30from .tokens import *

class ScannerError(MarkedYAMLError):
    """Error raised by the scanner when the input cannot be tokenized."""

class SimpleKey:
    """Plain record of a position at which a simple key may start.

    See the simple keys treatment in the scanner for how it is used.
    """

    def __init__(self, token_number, required, index, line, column, mark):
        # Number of the token that the candidate key would precede.
        self.token_number = token_number
        # Whether a key is mandatory at this position.
        self.required = required
        # Where the candidate starts in the stream.
        self.index, self.line, self.column = index, line, column
        # Mark used for the KEY token and for error reporting.
        self.mark = mark

46class Scanner:

48 def __init__(self):

49 """Initialize the scanner."""

50 # It is assumed that Scanner and Reader will have a common descendant.

51 # Reader do the dirty work of checking for BOM and converting the

52 # input data to Unicode. It also adds NUL to the end.

53 #

54 # Reader supports the following methods

55 # self.peek(i=0) # peek the next i-th character

56 # self.prefix(l=1) # peek the next l characters

57 # self.forward(l=1) # read the next l characters and move the pointer.

59 # Had we reached the end of the stream?

60 self.done = False

62 # The number of unclosed '{' and '['. `flow_level == 0` means block

63 # context.

64 self.flow_level = 0

66 # List of processed tokens that are not yet emitted.

67 self.tokens = []

69 # Add the STREAM-START token.

70 self.fetch_stream_start()

72 # Number of tokens that were emitted through the `get_token` method.

73 self.tokens_taken = 0

75 # The current indentation level.

76 self.indent = -1

78 # Past indentation levels.

79 self.indents = []

81 # Variables related to simple keys treatment.

83 # A simple key is a key that is not denoted by the '?' indicator.

84 # Example of simple keys:

85 # ---

86 # block simple key: value

87 # ? not a simple key:

88 # : { flow simple key: value }

89 # We emit the KEY token before all keys, so when we find a potential

90 # simple key, we try to locate the corresponding ':' indicator.

91 # Simple keys should be limited to a single line and 1024 characters.

93 # Can a simple key start at the current position? A simple key may

94 # start:

95 # - at the beginning of the line, not counting indentation spaces

96 # (in block context),

97 # - after '{', '[', ',' (in the flow context),

98 # - after '?', ':', '-' (in the block context).

99 # In the block context, this flag also signifies if a block collection

100 # may start at the current position.

101 self.allow_simple_key = True

102

103 # Keep track of possible simple keys. This is a dictionary. The key

104 # is `flow_level`; there can be no more that one possible simple key

105 # for each level. The value is a SimpleKey record:

106 # (token_number, required, index, line, column, mark)

107 # A simple key may start with ALIAS, ANCHOR, TAG, SCALAR(flow),

108 # '[', or '{' tokens.

109 self.possible_simple_keys = {}

110

111 # Public methods.

112

def check_token(self, *choices):
    """Tell whether the next token is an instance of one of `choices`.

    With no `choices`, tell whether there is a next token at all.
    """
    # Fill the queue until its head can no longer change.
    while self.need_more_tokens():
        self.fetch_more_tokens()
    if not self.tokens:
        return False
    if not choices:
        return True
    head = self.tokens[0]
    return any(isinstance(head, choice) for choice in choices)

124

def peek_token(self):
    """Return the next token without removing it from the queue.

    Returns None when there are no more tokens.
    """
    while self.need_more_tokens():
        self.fetch_more_tokens()
    return self.tokens[0] if self.tokens else None

134

def get_token(self):
    """Remove and return the next token, or return None if there is none."""
    while self.need_more_tokens():
        self.fetch_more_tokens()
    if not self.tokens:
        return None
    # Keep the emitted-token count in step with the queue.
    self.tokens_taken += 1
    return self.tokens.pop(0)

142

143 # Private methods.

144

def need_more_tokens(self):
    """Return True if more input must be scanned before the next token is final.

    Always returns a bool: False once the stream is done, True when the
    queue is empty, and otherwise True only while the token at the head of
    the queue is still a possible simple key.
    """
    if self.done:
        return False
    if not self.tokens:
        return True
    # The current token may be a potential simple key, so we need to look
    # further.  Expired candidates are dropped first.
    self.stale_possible_simple_keys()
    return self.next_possible_simple_key() == self.tokens_taken

155

def fetch_more_tokens(self):
    """Scan forward to the next token and dispatch to the matching fetcher.

    Raises ScannerError when the next character cannot start any token.
    """
    # Eat whitespaces and comments until we reach the next token.
    self.scan_to_next_token()

    # Remove obsolete possible simple keys.
    self.stale_possible_simple_keys()

    # Compare the current indentation and column.  It may add some tokens
    # and decrease the current indentation level.
    self.unwind_indent(self.column)

    # Peek the next character.
    ch = self.peek()

    # TODO: support for BOM within a stream ('\uFEFF' should issue a
    # BOMToken).

    # Each branch handles one leading character.  A branch whose extra
    # condition fails falls through to the plain-scalar check below.
    if ch == '\0':
        # End of stream.
        return self.fetch_stream_end()
    elif ch == '%':
        if self.check_directive():
            return self.fetch_directive()
    elif ch == '-':
        # A document start takes precedence over a block entry.
        if self.check_document_start():
            return self.fetch_document_start()
        if self.check_block_entry():
            return self.fetch_block_entry()
    elif ch == '.':
        if self.check_document_end():
            return self.fetch_document_end()
    elif ch == '[':
        return self.fetch_flow_sequence_start()
    elif ch == '{':
        return self.fetch_flow_mapping_start()
    elif ch == ']':
        return self.fetch_flow_sequence_end()
    elif ch == '}':
        return self.fetch_flow_mapping_end()
    elif ch == ',':
        return self.fetch_flow_entry()
    elif ch == '?':
        if self.check_key():
            return self.fetch_key()
    elif ch == ':':
        if self.check_value():
            return self.fetch_value()
    elif ch == '*':
        return self.fetch_alias()
    elif ch == '&':
        return self.fetch_anchor()
    elif ch == '!':
        return self.fetch_tag()
    elif ch == '|':
        # Literal scalars exist only in block context.
        if not self.flow_level:
            return self.fetch_literal()
    elif ch == '>':
        # Folded scalars exist only in block context.
        if not self.flow_level:
            return self.fetch_folded()
    elif ch == '\'':
        return self.fetch_single()
    elif ch == '\"':
        return self.fetch_double()

    # It must be a plain scalar then.
    if self.check_plain():
        return self.fetch_plain()

    # No?  It's an error.  Let's produce a nice error message.
    raise ScannerError("while scanning for the next token", None,
            "found character %r that cannot start any token" % ch,
            self.get_mark())

261

262 # Simple keys treatment.

263

def next_possible_simple_key(self):
    """Return the lowest token number among the possible simple keys.

    Returns None when no possible simple key is recorded.
    """
    candidates = self.possible_simple_keys.values()
    return min((key.token_number for key in candidates), default=None)

278

def stale_possible_simple_keys(self):
    """Drop recorded candidates that can no longer be simple keys.

    According to the YAML specification, simple keys
    - should be limited to a single line,
    - should be no longer than 1024 characters.
    Disabling this procedure would allow simple keys of any length and
    height (which may cause problems if indentation is broken, though).

    Raises ScannerError if an expired candidate was a required key.
    """
    # Iterate over a snapshot because entries are deleted along the way.
    for level, key in list(self.possible_simple_keys.items()):
        if key.line == self.line and self.index - key.index <= 1024:
            continue
        if key.required:
            raise ScannerError("while scanning a simple key", key.mark,
                    "could not find expected ':'", self.get_mark())
        del self.possible_simple_keys[level]

294

def save_possible_simple_key(self):
    """Record the current position as a possible simple key, if allowed.

    The next token may start a simple key.  This function is called for
    ALIAS, ANCHOR, TAG, SCALAR(flow), '[', and '{'.
    """
    # A simple key is required here when we are in block context and sit
    # exactly at the current indentation column.
    required = (not self.flow_level) and self.indent == self.column

    if not self.allow_simple_key:
        return

    # Replace whatever candidate was saved for this flow level with the
    # number and position of the token about to be scanned.
    self.remove_possible_simple_key()
    upcoming = self.tokens_taken + len(self.tokens)
    self.possible_simple_keys[self.flow_level] = SimpleKey(
            upcoming, required,
            self.index, self.line, self.column, self.get_mark())

311

def remove_possible_simple_key(self):
    """Forget the possible simple key saved for the current flow level.

    Raises ScannerError if that candidate was a required key; in that case
    the candidate is left in place.
    """
    level = self.flow_level
    if level not in self.possible_simple_keys:
        return
    key = self.possible_simple_keys[level]
    if key.required:
        raise ScannerError("while scanning a simple key", key.mark,
                "could not find expected ':'", self.get_mark())
    del self.possible_simple_keys[level]

322

323 # Indentation functions.

324

def unwind_indent(self, column):
    """Close block collections that are indented deeper than `column`.

    Appends one BLOCK-END token per indentation level that is popped.
    """
    # Strictly speaking, tokens in flow context should respect indentation
    # too (the spec's condition would be `self.indent >= column`), but that
    # would prohibit intuitively correct constructions such as
    #   key : {
    #   }
    # so in the flow context indentation is ignored.  This makes the
    # scanner less restrictive than the specification requires.
    if not self.flow_level:
        # In block context, we may need to issue the BLOCK-END tokens.
        while self.indent > column:
            here = self.get_mark()
            self.indent = self.indents.pop()
            self.tokens.append(BlockEndToken(here, here))

348

def add_indent(self, column):
    """Push a new indentation level if `column` is deeper than the current one.

    Returns True when the level was increased, False otherwise.
    """
    if not self.indent < column:
        return False
    # Remember the level we are leaving so it can be restored later.
    self.indents.append(self.indent)
    self.indent = column
    return True

356

357 # Fetchers.

358

def fetch_stream_start(self):
    """Queue the STREAM-START token.

    STREAM-START is always the first token and STREAM-END the last one.
    """
    # The token is zero-width: both marks are the current position.
    here = self.get_mark()
    self.tokens.append(
            StreamStartToken(here, here, encoding=self.encoding))

369

370

def fetch_stream_end(self):
    """Queue the STREAM-END token and mark the scanner as finished."""
    # Set the current indentation to -1, closing every open block.
    self.unwind_indent(-1)

    # Reset simple keys.
    self.remove_possible_simple_key()
    self.allow_simple_key = False
    self.possible_simple_keys = {}

    # STREAM-END is zero-width: both marks are the current position.
    here = self.get_mark()
    self.tokens.append(StreamEndToken(here, here))

    # The stream is finished.
    self.done = True

389

def fetch_directive(self):
    """Queue a DIRECTIVE token scanned at the current position."""
    # Set the current indentation to -1.
    self.unwind_indent(-1)

    # Reset simple keys.
    self.remove_possible_simple_key()
    self.allow_simple_key = False

    # Scan and add DIRECTIVE.
    directive = self.scan_directive()
    self.tokens.append(directive)

401

def fetch_document_start(self):
    """Queue a DOCUMENT-START token for the '---' indicator."""
    self.fetch_document_indicator(DocumentStartToken)

404

def fetch_document_end(self):
    """Queue a DOCUMENT-END token for the '...' indicator."""
    self.fetch_document_indicator(DocumentEndToken)

407

def fetch_document_indicator(self, TokenClass):
    """Consume a three-character document indicator and queue its token.

    `TokenClass` is called with the start and end marks of the indicator.
    """
    # Set the current indentation to -1.
    self.unwind_indent(-1)

    # Reset simple keys.  Note that there could not be a block collection
    # after '---'.
    self.remove_possible_simple_key()
    self.allow_simple_key = False

    # Add DOCUMENT-START or DOCUMENT-END, spanning the three characters.
    begin = self.get_mark()
    self.forward(3)
    finish = self.get_mark()
    self.tokens.append(TokenClass(begin, finish))

423

def fetch_flow_sequence_start(self):
    """Queue a FLOW-SEQUENCE-START token for the '[' indicator."""
    self.fetch_flow_collection_start(FlowSequenceStartToken)

426

def fetch_flow_mapping_start(self):
    """Queue a FLOW-MAPPING-START token for the '{' indicator."""
    self.fetch_flow_collection_start(FlowMappingStartToken)

429

def fetch_flow_collection_start(self, TokenClass):
    """Enter a flow collection: consume its opening character, queue the token.

    `TokenClass` is called with the start and end marks of the indicator.
    """
    # '[' and '{' may start a simple key; record it at the outer level,
    # before the flow level is increased.
    self.save_possible_simple_key()
    self.flow_level += 1

    # Simple keys are allowed after '[' and '{'.
    self.allow_simple_key = True

    # Add FLOW-SEQUENCE-START or FLOW-MAPPING-START.
    begin = self.get_mark()
    self.forward()
    finish = self.get_mark()
    self.tokens.append(TokenClass(begin, finish))

446

def fetch_flow_sequence_end(self):
    """Queue a FLOW-SEQUENCE-END token for the ']' indicator."""
    self.fetch_flow_collection_end(FlowSequenceEndToken)

449

def fetch_flow_mapping_end(self):
    """Queue a FLOW-MAPPING-END token for the '}' indicator."""
    self.fetch_flow_collection_end(FlowMappingEndToken)

452

def fetch_flow_collection_end(self, TokenClass):
    """Leave a flow collection: consume its closing character, queue the token.

    `TokenClass` is called with the start and end marks of the indicator.
    """
    # Reset possible simple key on the current level, then step out of it.
    self.remove_possible_simple_key()
    self.flow_level -= 1

    # No simple keys after ']' or '}'.
    self.allow_simple_key = False

    # Add FLOW-SEQUENCE-END or FLOW-MAPPING-END.
    begin = self.get_mark()
    self.forward()
    finish = self.get_mark()
    self.tokens.append(TokenClass(begin, finish))

469

def fetch_flow_entry(self):
    """Queue a FLOW-ENTRY token for the ',' indicator."""
    # Simple keys are allowed after ','.
    self.allow_simple_key = True

    # Reset possible simple key on the current level.
    self.remove_possible_simple_key()

    # Add FLOW-ENTRY, spanning the single indicator character.
    begin = self.get_mark()
    self.forward()
    finish = self.get_mark()
    self.tokens.append(FlowEntryToken(begin, finish))

483

def fetch_block_entry(self):
    """Queue a BLOCK-ENTRY token for the '-' indicator.

    In block context this may first queue BLOCK-SEQUENCE-START.  Raises
    ScannerError if a sequence entry is not allowed at this position.
    """
    # Only the block context needs additional checks.  A block entry in
    # the flow context is an error too, but we let the parser detect it.
    if not self.flow_level:

        # Are we allowed to start a new entry?
        if not self.allow_simple_key:
            raise ScannerError(None, None,
                    "sequence entries are not allowed here",
                    self.get_mark())

        # We may need to add BLOCK-SEQUENCE-START.
        if self.add_indent(self.column):
            here = self.get_mark()
            self.tokens.append(BlockSequenceStartToken(here, here))

    # Simple keys are allowed after '-'.
    self.allow_simple_key = True

    # Reset possible simple key on the current level.
    self.remove_possible_simple_key()

    # Add BLOCK-ENTRY, spanning the single indicator character.
    begin = self.get_mark()
    self.forward()
    finish = self.get_mark()
    self.tokens.append(BlockEntryToken(begin, finish))

516

def fetch_key(self):
    """Queue a KEY token for an explicit '?' indicator.

    In block context this may first queue BLOCK-MAPPING-START.  Raises
    ScannerError if a mapping key is not allowed at this position.
    """
    in_block = not self.flow_level

    # Block context needs additional checks.
    if in_block:

        # Are we allowed to start a key (not necessarily a simple one)?
        if not self.allow_simple_key:
            raise ScannerError(None, None,
                    "mapping keys are not allowed here",
                    self.get_mark())

        # We may need to add BLOCK-MAPPING-START.
        if self.add_indent(self.column):
            here = self.get_mark()
            self.tokens.append(BlockMappingStartToken(here, here))

    # Simple keys are allowed after '?' in the block context.
    self.allow_simple_key = in_block

    # Reset possible simple key on the current level.
    self.remove_possible_simple_key()

    # Add KEY, spanning the single indicator character.
    begin = self.get_mark()
    self.forward()
    finish = self.get_mark()
    self.tokens.append(KeyToken(begin, finish))

544

def fetch_value(self):
    """Queue a VALUE token for the ':' indicator.

    If a possible simple key is recorded for the current flow level, a KEY
    token (and, in block context, possibly BLOCK-MAPPING-START) is inserted
    retroactively at that key's position.  Otherwise the ':' is treated as
    part of a complex key, and ScannerError is raised if a mapping value is
    not allowed here.
    """
    level = self.flow_level

    if level in self.possible_simple_keys:
        # The ':' confirms the recorded candidate as a simple key.
        key = self.possible_simple_keys.pop(level)
        slot = key.token_number - self.tokens_taken

        # Add KEY in front of the token that started the key.
        self.tokens.insert(slot, KeyToken(key.mark, key.mark))

        # If this key starts a new block mapping, we need to add
        # BLOCK-MAPPING-START in front of the KEY just inserted.
        if not level and self.add_indent(key.column):
            self.tokens.insert(slot,
                    BlockMappingStartToken(key.mark, key.mark))

        # There cannot be two simple keys one after another.
        self.allow_simple_key = False

    else:
        # It must be a part of a complex key.

        # Block context needs additional checks.  (Do we really need
        # them?  They will be caught by the parser anyway.)
        if not level:

            # We are allowed to start a complex value if and only if we
            # can start a simple key.
            if not self.allow_simple_key:
                raise ScannerError(None, None,
                        "mapping values are not allowed here",
                        self.get_mark())

            # If this value starts a new block mapping, we need to add
            # BLOCK-MAPPING-START.  It will be detected as an error later
            # by the parser.
            if self.add_indent(self.column):
                here = self.get_mark()
                self.tokens.append(BlockMappingStartToken(here, here))

        # Simple keys are allowed after ':' in the block context.
        self.allow_simple_key = not level

        # Reset possible simple key on the current level.
        self.remove_possible_simple_key()

    # Add VALUE, spanning the single indicator character.
    begin = self.get_mark()
    self.forward()
    finish = self.get_mark()
    self.tokens.append(ValueToken(begin, finish))

600

def fetch_alias(self):
    """Queue an ALIAS token; the alias may itself be a simple key."""
    # ALIAS could be a simple key.
    self.save_possible_simple_key()

    # No simple keys after ALIAS.
    self.allow_simple_key = False

    # Scan and add ALIAS.
    alias = self.scan_anchor(AliasToken)
    self.tokens.append(alias)

611

def fetch_anchor(self):
    """Queue an ANCHOR token; the anchor may start a simple key."""
    # ANCHOR could start a simple key.
    self.save_possible_simple_key()

    # No simple keys after ANCHOR.
    self.allow_simple_key = False

    # Scan and add ANCHOR.
    anchor = self.scan_anchor(AnchorToken)
    self.tokens.append(anchor)

622

def fetch_tag(self):
    """Queue a TAG token; the tag may start a simple key."""
    # TAG could start a simple key.
    self.save_possible_simple_key()

    # No simple keys after TAG.
    self.allow_simple_key = False

    # Scan and add TAG.
    tag = self.scan_tag()
    self.tokens.append(tag)

633

def fetch_literal(self):
    """Queue a block scalar introduced by the '|' indicator."""
    self.fetch_block_scalar(style='|')

636

def fetch_folded(self):
    """Queue a block scalar introduced by the '>' indicator."""
    self.fetch_block_scalar(style='>')

639

def fetch_block_scalar(self, style):
    """Queue a SCALAR token for a block scalar of the given `style`."""
    # A simple key may follow a block scalar.
    self.allow_simple_key = True

    # Reset possible simple key on the current level.
    self.remove_possible_simple_key()

    # Scan and add SCALAR.
    scalar = self.scan_block_scalar(style)
    self.tokens.append(scalar)

650

def fetch_single(self):
    """Queue a single-quoted flow scalar."""
    self.fetch_flow_scalar(style='\'')

653

def fetch_double(self):
    """Queue a double-quoted flow scalar."""
    self.fetch_flow_scalar(style='"')

656

def fetch_flow_scalar(self, style):
    """Queue a SCALAR token for a quoted scalar of the given `style`."""
    # A flow scalar could be a simple key.
    self.save_possible_simple_key()

    # No simple keys after flow scalars.
    self.allow_simple_key = False

    # Scan and add SCALAR.
    scalar = self.scan_flow_scalar(style)
    self.tokens.append(scalar)

667

def fetch_plain(self):
    """Queue a SCALAR token for a plain (unquoted) scalar."""
    # A plain scalar could be a simple key.
    self.save_possible_simple_key()

    # No simple keys after plain scalars.  But note that `scan_plain` will
    # change this flag if the scan is finished at the beginning of the
    # line, so the flag must be cleared before scanning.
    self.allow_simple_key = False

    # Scan and add SCALAR.  May change `allow_simple_key`.
    scalar = self.scan_plain()
    self.tokens.append(scalar)

680

681 # Checkers.

682

def check_directive(self):
    """Return True if a '%' at the current position starts a directive.

    DIRECTIVE: ^ '%' ...
    The '%' indicator itself has already been checked by the caller, so
    only the start-of-line condition is tested.  Always returns a bool,
    like the other checkers.
    """
    return self.column == 0

689

def check_document_start(self):
    """Return True if a document start indicator begins at the current position.

    DOCUMENT-START: ^ '---' (' '|'\\n')
    The indicator must sit at column 0 and be followed by whitespace, a
    line break, or the end of the stream.  Always returns a bool, like the
    other checkers.
    """
    return (self.column == 0
            and self.prefix(3) == '---'
            and self.peek(3) in '\0 \t\r\n\x85\u2028\u2029')

697

def check_document_end(self):
    """Return True if a document end indicator begins at the current position.

    DOCUMENT-END: ^ '...' (' '|'\\n')
    The indicator must sit at column 0 and be followed by whitespace, a
    line break, or the end of the stream.  Always returns a bool, like the
    other checkers.
    """
    return (self.column == 0
            and self.prefix(3) == '...'
            and self.peek(3) in '\0 \t\r\n\x85\u2028\u2029')

705

def check_block_entry(self):
    """Return True if the '-' at the current position is a block entry.

    BLOCK-ENTRY: '-' (' '|'\\n')
    """
    following = self.peek(1)
    return following in '\0 \t\r\n\x85\u2028\u2029'

710

def check_key(self):
    """Return True if the '?' at the current position is a key indicator.

    KEY(flow context):  '?'
    KEY(block context): '?' (' '|'\\n')
    """
    if not self.flow_level:
        return self.peek(1) in '\0 \t\r\n\x85\u2028\u2029'
    return True

720

def check_value(self):
    """Return True if the ':' at the current position is a value indicator.

    VALUE(flow context):  ':'
    VALUE(block context): ':' (' '|'\\n')
    """
    if not self.flow_level:
        return self.peek(1) in '\0 \t\r\n\x85\u2028\u2029'
    return True

730

def check_plain(self):
    """Return True if a plain scalar may start at the current position.

    A plain scalar may start with any non-space character except:
      '-', '?', ':', ',', '[', ']', '{', '}',
      '#', '&', '*', '!', '|', '>', '\\'', '\\"',
      '%', '@', '`'.

    It may also start with
      '-', '?', ':'
    if it is followed by a non-space character.

    Note that the last rule is limited to the block context (except for
    the '-' character) because we want the flow context to be space
    independent.
    """
    ch = self.peek()

    # Anything that is not whitespace or an indicator starts a plain scalar.
    if ch not in '\0 \t\r\n\x85\u2028\u2029-?:,[]{}#&*!|>\'\"%@`':
        return True

    # An indicator can only qualify when a non-space character follows it.
    if self.peek(1) in '\0 \t\r\n\x85\u2028\u2029':
        return False

    if ch == '-':
        return True
    return not self.flow_level and ch in '?:'

749

750 # Scanners.

751

def scan_to_next_token(self):
    """Skip spaces, line breaks and comments up to the next token.

    If a line break is found in the block context, the flag
    `allow_simple_key` is turned on.
    """
    # The byte order mark is stripped if it's the first character in the
    # stream.  We do not yet support BOM inside the stream as the
    # specification requires; any such mark will be considered a part of
    # the document.
    #
    # TODO: We need to make tab handling rules more sane.  A good rule is
    #   Tabs cannot precede tokens
    #   BLOCK-SEQUENCE-START, BLOCK-MAPPING-START, BLOCK-END,
    #   KEY(block), VALUE(block), BLOCK-ENTRY
    # So the checking code is
    #   if <TAB>:
    #       self.allow_simple_keys = False
    # We also need to add the check for `allow_simple_keys == True` to
    # `unwind_indent` before issuing BLOCK-END.
    # Scanners for block, flow, and plain scalars need to be modified.

    if self.index == 0 and self.peek() == '\uFEFF':
        self.forward()

    while True:
        # Leading spaces.
        while self.peek() == ' ':
            self.forward()

        # A comment runs up to, but not including, the line break.
        if self.peek() == '#':
            while self.peek() not in '\0\r\n\x85\u2028\u2029':
                self.forward()

        # No line break here means the next token has been reached.
        if not self.scan_line_break():
            break
        if not self.flow_level:
            self.allow_simple_key = True

786

def scan_directive(self):
    """Scan a directive line and return its DirectiveToken.

    See the specification for details.  Only the YAML and TAG directives
    carry a value; for any other name the rest of the line is skipped and
    the value is None.
    """
    start_mark = self.get_mark()
    # Step over the '%' indicator.
    self.forward()
    name = self.scan_directive_name(start_mark)
    if name == 'YAML':
        value = self.scan_yaml_directive_value(start_mark)
        end_mark = self.get_mark()
    elif name == 'TAG':
        value = self.scan_tag_directive_value(start_mark)
        end_mark = self.get_mark()
    else:
        value = None
        # The token ends at the name; the remainder of the line is ignored.
        end_mark = self.get_mark()
        while self.peek() not in '\0\r\n\x85\u2028\u2029':
            self.forward()
    self.scan_directive_ignored_line(start_mark)
    return DirectiveToken(name, value, start_mark, end_mark)

805

def scan_directive_name(self, start_mark):
    """Scan and return the name of a directive.

    See the specification for details.  The name consists of ASCII letters,
    digits, '-' and '_'.  Raises ScannerError if the name is empty or is
    not followed by a space, a line break or the end of the stream.
    """
    # Measure the name by peeking ahead; nothing is consumed yet.
    size = 0
    ch = self.peek(size)
    while ('0' <= ch <= '9' or 'A' <= ch <= 'Z' or 'a' <= ch <= 'z'
            or ch in '-_'):
        size += 1
        ch = self.peek(size)
    if not size:
        raise ScannerError("while scanning a directive", start_mark,
                "expected alphabetic or numeric character, but found %r"
                % ch, self.get_mark())
    name = self.prefix(size)
    self.forward(size)
    ch = self.peek()
    if ch not in '\0 \r\n\x85\u2028\u2029':
        raise ScannerError("while scanning a directive", start_mark,
                "expected alphabetic or numeric character, but found %r"
                % ch, self.get_mark())
    return name

826

def scan_yaml_directive_value(self, start_mark):
    """Scan the version of a YAML directive and return (major, minor).

    See the specification for details.  Raises ScannerError if the two
    numbers are not separated by '.' or the version is followed by
    anything other than a space, a line break or the end of the stream.
    """
    while self.peek() == ' ':
        self.forward()
    major = self.scan_yaml_directive_number(start_mark)
    if self.peek() != '.':
        raise ScannerError("while scanning a directive", start_mark,
                "expected a digit or '.', but found %r" % self.peek(),
                self.get_mark())
    # Step over the '.' separator.
    self.forward()
    minor = self.scan_yaml_directive_number(start_mark)
    if self.peek() not in '\0 \r\n\x85\u2028\u2029':
        raise ScannerError("while scanning a directive", start_mark,
                "expected a digit or ' ', but found %r" % self.peek(),
                self.get_mark())
    version = (major, minor)
    return version

843

def scan_yaml_directive_number(self, start_mark):
    """Scan a run of decimal digits and return it as an int.

    See the specification for details.  Raises ScannerError if the current
    character is not a digit.
    """
    first = self.peek()
    if not ('0' <= first <= '9'):
        raise ScannerError("while scanning a directive", start_mark,
                "expected a digit, but found %r" % first, self.get_mark())
    # Measure the digit run by peeking ahead, then consume it in one step.
    size = 0
    while '0' <= self.peek(size) <= '9':
        size += 1
    number = int(self.prefix(size))
    self.forward(size)
    return number

856

def scan_tag_directive_value(self, start_mark):
    """Scan the value of a TAG directive and return (handle, prefix).

    See the specification for details.
    """
    # Spaces before the handle.
    while self.peek() == ' ':
        self.forward()
    handle = self.scan_tag_directive_handle(start_mark)
    # Spaces between the handle and the prefix.
    while self.peek() == ' ':
        self.forward()
    prefix = self.scan_tag_directive_prefix(start_mark)
    return (handle, prefix)

866

def scan_tag_directive_handle(self, start_mark):
    """Scan and return the handle part of a TAG directive.

    See the specification for details.  Raises ScannerError unless the
    handle is followed by a space.
    """
    handle = self.scan_tag_handle('directive', start_mark)
    following = self.peek()
    if following != ' ':
        raise ScannerError("while scanning a directive", start_mark,
                "expected ' ', but found %r" % following, self.get_mark())
    return handle

875

def scan_tag_directive_prefix(self, start_mark):
    """Scan and return the prefix part of a TAG directive.

    See the specification for details.  Raises ScannerError unless the
    prefix is followed by a space, a line break or the end of the stream.
    """
    prefix = self.scan_tag_uri('directive', start_mark)
    following = self.peek()
    if following not in '\0 \r\n\x85\u2028\u2029':
        raise ScannerError("while scanning a directive", start_mark,
                "expected ' ', but found %r" % following, self.get_mark())
    return prefix

884

def scan_directive_ignored_line(self, start_mark):
    """Consume the rest of a directive line: spaces, comment, line break.

    See the specification for details.  Raises ScannerError if anything
    other than a comment or a line break follows the spaces.
    """
    while self.peek() == ' ':
        self.forward()
    if self.peek() == '#':
        # Skip the comment up to, but not including, the line break.
        while self.peek() not in '\0\r\n\x85\u2028\u2029':
            self.forward()
    trailing = self.peek()
    if trailing not in '\0\r\n\x85\u2028\u2029':
        raise ScannerError("while scanning a directive", start_mark,
                "expected a comment or a line break, but found %r"
                % trailing, self.get_mark())
    self.scan_line_break()

898

def scan_anchor(self, TokenClass):
    """Scan an anchor ('&name') or alias ('*name') and return its token.

    `TokenClass` is called with the name and the start and end marks.
    Raises ScannerError if the name is empty or is followed by an
    unexpected character.
    """
    # The specification does not restrict characters for anchors and
    # aliases.  This may lead to problems, for instance, the document:
    #   [ *alias, value ]
    # can be interpreted in two ways, as
    #   [ "value" ]
    # and
    #   [ *alias , "value" ]
    # Therefore we restrict aliases to numbers and ASCII letters.
    start_mark = self.get_mark()
    # The indicator character is used only to word the error messages.
    name = 'alias' if self.peek() == '*' else 'anchor'
    self.forward()

    # Measure the name by peeking ahead; nothing is consumed yet.
    size = 0
    ch = self.peek(size)
    while ('0' <= ch <= '9' or 'A' <= ch <= 'Z' or 'a' <= ch <= 'z'
            or ch in '-_'):
        size += 1
        ch = self.peek(size)
    if not size:
        raise ScannerError("while scanning an %s" % name, start_mark,
                "expected alphabetic or numeric character, but found %r"
                % ch, self.get_mark())
    value = self.prefix(size)
    self.forward(size)

    ch = self.peek()
    if ch not in '\0 \t\r\n\x85\u2028\u2029?:,]}%@`':
        raise ScannerError("while scanning an %s" % name, start_mark,
                "expected alphabetic or numeric character, but found %r"
                % ch, self.get_mark())
    end_mark = self.get_mark()
    return TokenClass(value, start_mark, end_mark)

934

def scan_tag(self):
    """Scan a tag and return a TagToken whose value is (handle, suffix).

    See the specification for details.  Three forms are recognised:
    a verbatim tag '!<...>', a lone '!', and a tag with an optional
    handle followed by a suffix.  Raises ScannerError on a missing '>' or
    when the tag is followed by an unexpected character.
    """
    start_mark = self.get_mark()
    ch = self.peek(1)
    if ch == '<':
        # Verbatim tag: no handle, the URI sits between '!<' and '>'.
        handle = None
        self.forward(2)
        suffix = self.scan_tag_uri('tag', start_mark)
        if self.peek() != '>':
            raise ScannerError("while parsing a tag", start_mark,
                    "expected '>', but found %r" % self.peek(),
                    self.get_mark())
        self.forward()
    elif ch in '\0 \t\r\n\x85\u2028\u2029':
        # A lone '!'.
        handle = None
        suffix = '!'
        self.forward()
    else:
        # Look ahead for a second '!' before the next space or line
        # break; if there is one, the tag carries a handle.
        offset = 1
        use_handle = False
        while ch not in '\0 \r\n\x85\u2028\u2029':
            if ch == '!':
                use_handle = True
                break
            offset += 1
            ch = self.peek(offset)
        if use_handle:
            handle = self.scan_tag_handle('tag', start_mark)
        else:
            handle = '!'
            self.forward()
        suffix = self.scan_tag_uri('tag', start_mark)
    ch = self.peek()
    if ch not in '\0 \r\n\x85\u2028\u2029':
        raise ScannerError("while scanning a tag", start_mark,
                "expected ' ', but found %r" % ch, self.get_mark())
    end_mark = self.get_mark()
    return TagToken((handle, suffix), start_mark, end_mark)

975

def scan_block_scalar(self, style):
    """Scan a block scalar and return its ScalarToken.

    See the specification for details.  `style` is the indicator that
    introduced the scalar; lines are folded only when it is '>'.
    """
    folded = (style == '>')

    chunks = []
    start_mark = self.get_mark()

    # Scan the header.
    self.forward()
    chomping, increment = self.scan_block_scalar_indicators(start_mark)
    self.scan_block_scalar_ignored_line(start_mark)

    # Determine the indentation level and go to the first non-empty line.
    min_indent = max(1, self.indent + 1)
    if increment is None:
        breaks, max_indent, end_mark = self.scan_block_scalar_indentation()
        indent = max(min_indent, max_indent)
    else:
        indent = min_indent + increment - 1
        breaks, end_mark = self.scan_block_scalar_breaks(indent)
    line_break = ''

    # Scan the inner part of the block scalar.
    while self.column == indent and self.peek() != '\0':
        chunks.extend(breaks)
        leading_non_space = self.peek() not in ' \t'

        # Take everything up to the end of the line as one chunk.
        length = 0
        while self.peek(length) not in '\0\r\n\x85\u2028\u2029':
            length += 1
        chunks.append(self.prefix(length))
        self.forward(length)

        line_break = self.scan_line_break()
        breaks, end_mark = self.scan_block_scalar_breaks(indent)

        # Stop as soon as the next line does not belong to the scalar.
        if not (self.column == indent and self.peek() != '\0'):
            break

        # Unfortunately, folding rules are ambiguous.
        #
        # This is the folding according to the specification:
        if folded and line_break == '\n' \
                and leading_non_space and self.peek() not in ' \t':
            if not breaks:
                chunks.append(' ')
        else:
            chunks.append(line_break)

        # Clark Evans's interpretation (also used in the spec examples)
        # differs: it folds whenever `folded and line_break == '\n'`,
        # appending ' ' only if there are no breaks and the next character
        # is not a space or tab, and appends the line break otherwise.

    # Chomp the tail: keep the final line break unless chomping is False,
    # and keep the trailing breaks as well only when chomping is True.
    if chomping is not False:
        chunks.append(line_break)
    if chomping is True:
        chunks.extend(breaks)

    # We are done.
    return ScalarToken(''.join(chunks), False, start_mark, end_mark,
            style)

1051

def scan_block_scalar_indicators(self, start_mark):
    """Read the optional block scalar header indicators.

    Returns `(chomping, increment)`.  `chomping` is True for '+', False
    for '-' and None when absent; `increment` is the explicit
    indentation digit (1-9) or None when absent.  The two indicators
    may appear in either order, each at most once.

    Raises ScannerError for a '0' indentation indicator, or when the
    indicators are not followed by a space, a line break or the end of
    the stream.
    """
    chomping = None
    increment = None
    # At most two indicators, one of each kind.
    for _ in range(2):
        ch = self.peek()
        if chomping is None and ch in '+-':
            chomping = ch == '+'
            self.forward()
        elif increment is None and ch in '0123456789':
            increment = int(ch)
            if increment == 0:
                raise ScannerError("while scanning a block scalar", start_mark,
                        "expected indentation indicator in the range 1-9, but found 0",
                        self.get_mark())
            self.forward()
        else:
            break
    ch = self.peek()
    if ch not in '\0 \r\n\x85\u2028\u2029':
        raise ScannerError("while scanning a block scalar", start_mark,
                "expected chomping or indentation indicators, but found %r"
                % ch, self.get_mark())
    return chomping, increment

1091

def scan_block_scalar_ignored_line(self, start_mark):
    """Consume the rest of a block scalar header line.

    Skips spaces and an optional '#' comment, then the line break.
    Raises ScannerError if anything else precedes the end of the line.
    """
    line_end = '\0\r\n\x85\u2028\u2029'
    while self.peek() == ' ':
        self.forward()
    if self.peek() == '#':
        while self.peek() not in line_end:
            self.forward()
    found = self.peek()
    if found in line_end:
        self.scan_line_break()
        return
    raise ScannerError("while scanning a block scalar", start_mark,
            "expected a comment or a line break, but found %r" % found,
            self.get_mark())

1105

def scan_block_scalar_indentation(self):
    """Skip leading blank lines and measure their widest indentation.

    Consumes spaces and line breaks up to the first other character.
    Returns `(breaks, max_indent, end_mark)`: the line breaks consumed,
    the largest column reached while skipping spaces, and the mark
    taken just after the last line break (or at entry if there was
    none).
    """
    breaks = []
    deepest = 0
    end_mark = self.get_mark()
    while True:
        ch = self.peek()
        if ch == ' ':
            self.forward()
            deepest = max(deepest, self.column)
        elif ch in '\r\n\x85\u2028\u2029':
            breaks.append(self.scan_line_break())
            end_mark = self.get_mark()
        else:
            break
    return breaks, deepest, end_mark

1120

def scan_block_scalar_breaks(self, indent):
    """Consume empty lines inside a block scalar.

    On each line, spaces are skipped only up to column `indent`.
    Returns `(breaks, end_mark)`: the line breaks consumed and the
    mark taken just after the last of them (or at entry if there was
    none).
    """
    breaks = []
    end_mark = self.get_mark()
    while True:
        # Never step past the content indentation.
        while self.column < indent and self.peek() == ' ':
            self.forward()
        if self.peek() not in '\r\n\x85\u2028\u2029':
            return breaks, end_mark
        breaks.append(self.scan_line_break())
        end_mark = self.get_mark()

1133

def scan_flow_scalar(self, style):
    """Scan a quoted scalar and return it as a ScalarToken.

    `style` is '"' for a double-quoted scalar; any other value is
    handled as single-quoted.  The reader is expected to be positioned
    on the opening quote.

    Indentation rules are relaxed for quoted scalars: the quotes mark
    the beginning and the end unambiguously, so this is less
    restrictive than the specification requires.  The only thing
    checked is that document separators are not included in the
    scalar.
    """
    double = style == '"'
    start_mark = self.get_mark()
    closing_quote = self.peek()
    self.forward()
    # The content alternates between non-blank and blank runs.
    pieces = list(self.scan_flow_scalar_non_spaces(double, start_mark))
    while self.peek() != closing_quote:
        pieces.extend(self.scan_flow_scalar_spaces(double, start_mark))
        pieces.extend(self.scan_flow_scalar_non_spaces(double, start_mark))
    self.forward()
    end_mark = self.get_mark()
    return ScalarToken(''.join(pieces), False, start_mark, end_mark,
            style)

1157

# Single-character escapes of double-quoted scalars: the character that
# follows the backslash mapped to the text it stands for.
ESCAPE_REPLACEMENTS = {
    '0':    '\0',
    'a':    '\x07',
    'b':    '\x08',
    't':    '\x09',
    '\t':   '\x09',
    'n':    '\x0A',
    'v':    '\x0B',
    'f':    '\x0C',
    'r':    '\x0D',
    'e':    '\x1B',
    ' ':    '\x20',
    '\"':   '\"',
    '\\':   '\\',
    '/':    '/',
    'N':    '\x85',
    '_':    '\xA0',
    'L':    '\u2028',
    'P':    '\u2029',
}

# Numeric escapes of double-quoted scalars: the character that follows
# the backslash mapped to the number of hexadecimal digits after it.
ESCAPE_CODES = {
    'x':    2,
    'u':    4,
    'U':    8,
}

1184

def scan_flow_scalar_non_spaces(self, double, start_mark):
    """Scan a run of non-blank content inside a quoted scalar.

    Consumes plain text, quote characters that do not end the scalar
    and, for double-quoted scalars, backslash escapes.  Stops at a
    blank, a line break, the end of the stream or the closing quote.

    `double` selects double-quoted rules; `start_mark` is the mark of
    the opening quote and is used for error reporting.  Returns the
    list of text chunks read.

    Raises ScannerError for a malformed or unknown escape sequence,
    and for a numeric escape naming a code point above U+10FFFF.
    """
    chunks = []
    while True:
        length = 0
        while self.peek(length) not in '\'\"\\\0 \t\r\n\x85\u2028\u2029':
            length += 1
        if length:
            chunks.append(self.prefix(length))
            self.forward(length)
        ch = self.peek()
        if not double and ch == '\'' and self.peek(1) == '\'':
            # '' inside a single-quoted scalar is an escaped quote.
            chunks.append('\'')
            self.forward(2)
        elif (double and ch == '\'') or (not double and ch in '\"\\'):
            # The other style's special characters are literal here.
            chunks.append(ch)
            self.forward()
        elif double and ch == '\\':
            self.forward()
            ch = self.peek()
            if ch in self.ESCAPE_REPLACEMENTS:
                chunks.append(self.ESCAPE_REPLACEMENTS[ch])
                self.forward()
            elif ch in self.ESCAPE_CODES:
                length = self.ESCAPE_CODES[ch]
                self.forward()
                for k in range(length):
                    if self.peek(k) not in '0123456789ABCDEFabcdef':
                        raise ScannerError("while scanning a double-quoted scalar", start_mark,
                                "expected escape sequence of %d hexadecimal numbers, but found %r" %
                                    (length, self.peek(k)), self.get_mark())
                code = int(self.prefix(length), 16)
                # An 8-digit \U escape can exceed the largest code
                # point; chr() would then fail with ValueError or
                # OverflowError instead of a scanner error.
                if code > 0x10FFFF:
                    raise ScannerError("while scanning a double-quoted scalar", start_mark,
                            "found escape sequence for code point 0x%X, which is outside the Unicode range"
                            % code, self.get_mark())
                chunks.append(chr(code))
                self.forward(length)
            elif ch in '\r\n\x85\u2028\u2029':
                # Escaped line break: the break itself is dropped.
                self.scan_line_break()
                chunks.extend(self.scan_flow_scalar_breaks(double, start_mark))
            else:
                raise ScannerError("while scanning a double-quoted scalar", start_mark,
                        "found unknown escape character %r" % ch, self.get_mark())
        else:
            return chunks

1227

def scan_flow_scalar_spaces(self, double, start_mark):
    """Scan a run of blanks, possibly spanning lines, in a quoted scalar.

    Blanks that stay on one line are returned unchanged.  When the run
    contains line breaks, the surrounding blanks are dropped and the
    breaks are folded: a single '\\n' becomes one space, a '\\n'
    followed by further breaks is dropped in favour of those breaks,
    and any other first break is kept as it is.

    Returns the resulting list of chunks.  Raises ScannerError if the
    stream ends inside the scalar.
    """
    width = 0
    while self.peek(width) in ' \t':
        width += 1
    blanks = self.prefix(width)
    self.forward(width)
    ch = self.peek()
    if ch == '\0':
        raise ScannerError("while scanning a quoted scalar", start_mark,
                "found unexpected end of stream", self.get_mark())
    if ch not in '\r\n\x85\u2028\u2029':
        return [blanks]
    first_break = self.scan_line_break()
    more_breaks = self.scan_flow_scalar_breaks(double, start_mark)
    folded = []
    if first_break != '\n':
        folded.append(first_break)
    elif not more_breaks:
        folded.append(' ')
    folded.extend(more_breaks)
    return folded

1251

def scan_flow_scalar_breaks(self, double, start_mark):
    """Consume blank lines inside a quoted scalar.

    At the start of each line this checks for a document separator
    ('---' or '...' followed by a blank, a line break or the end of
    the stream) instead of checking indentation, and raises
    ScannerError if one is found.  Leading blanks on each line are
    skipped.  Returns the list of line breaks consumed.
    """
    breaks = []
    while True:
        marker = self.prefix(3)
        if marker in ('---', '...') \
                and self.peek(3) in '\0 \t\r\n\x85\u2028\u2029':
            raise ScannerError("while scanning a quoted scalar", start_mark,
                    "found unexpected document separator", self.get_mark())
        while self.peek() in ' \t':
            self.forward()
        if self.peek() not in '\r\n\x85\u2028\u2029':
            return breaks
        breaks.append(self.scan_line_break())

1269

def scan_plain(self):
    """Scan a plain (unquoted) scalar and return it as a ScalarToken.

    A word ends at a blank, a line break or the end of the stream, and
    at a ':' followed by one of those.  In the flow context it also
    ends at any of ',?[]{}' and at a ':' followed by one of ',[]{}'.
    Scanning stops at a '#', at an empty word, or, in the block
    context, once the column drops below the current indentation plus
    one.  Indentation is not checked in the flow context.

    `allow_simple_key` is cleared here whenever a word is consumed.
    """
    pieces = []
    start_mark = self.get_mark()
    end_mark = start_mark
    # Zero indentation is allowed for scalars; document separators at
    # the beginning of a line are checked for by scan_plain_spaces.
    indent = self.indent + 1
    pending_spaces = []
    while True:
        if self.peek() == '#':
            break
        width = 0
        while True:
            ch = self.peek(width)
            if ch in '\0 \t\r\n\x85\u2028\u2029':
                break
            if ch == ':' and self.peek(width + 1) in (
                    '\0 \t\r\n\x85\u2028\u2029'
                    + (',[]{}' if self.flow_level else '')):
                break
            if self.flow_level and ch in ',?[]{}':
                break
            width += 1
        if width == 0:
            break
        self.allow_simple_key = False
        # The separation before this word is only kept now that the
        # word is known to belong to the scalar.
        pieces.extend(pending_spaces)
        pieces.append(self.prefix(width))
        self.forward(width)
        end_mark = self.get_mark()
        pending_spaces = self.scan_plain_spaces(indent, start_mark)
        if not pending_spaces:
            break
        if self.peek() == '#':
            break
        if not self.flow_level and self.column < indent:
            break
    return ScalarToken(''.join(pieces), True, start_mark, end_mark)

1310

def scan_plain_spaces(self, indent, start_mark):
    """Scan the separation between two words of a plain scalar.

    Spaces that stay on one line are returned unchanged.  When the
    separation contains line breaks, the breaks are folded: a single
    '\\n' becomes one space, a '\\n' followed by further breaks is
    dropped in favour of those breaks, and any other first break is
    kept as it is.  Tabs are not treated as separation here.

    Returns the list of chunks to insert before the next word.  The
    list is empty when there is no separation or when a document
    separator ('---' or '...') starts one of the lines, which tells
    the caller that the scalar has ended.  `allow_simple_key` is set
    as soon as a line break is consumed.

    `indent` and `start_mark` are accepted for symmetry with the other
    scan helpers and are not used.
    """
    def at_document_separator():
        marker = self.prefix(3)
        return marker in ('---', '...') \
                and self.peek(3) in '\0 \t\r\n\x85\u2028\u2029'

    chunks = []
    length = 0
    while self.peek(length) in ' ':
        length += 1
    whitespaces = self.prefix(length)
    self.forward(length)
    ch = self.peek()
    if ch in '\r\n\x85\u2028\u2029':
        line_break = self.scan_line_break()
        self.allow_simple_key = True
        if at_document_separator():
            # Always hand back a list, never None.
            return []
        breaks = []
        while self.peek() in ' \r\n\x85\u2028\u2029':
            if self.peek() == ' ':
                self.forward()
            else:
                breaks.append(self.scan_line_break())
                if at_document_separator():
                    return []
        if line_break != '\n':
            chunks.append(line_break)
        elif not breaks:
            chunks.append(' ')
        chunks.extend(breaks)
    elif whitespaces:
        chunks.append(whitespaces)
    return chunks

1347

def scan_tag_handle(self, name, start_mark):
    """Scan a tag handle: '!', '!!' or '!word!'.

    `name` names the construct being scanned and is used in error
    messages only.  Returns the handle text including its '!'
    characters.  Raises ScannerError if the handle does not start
    with '!', or if a handle not followed by a space is not closed by
    a second '!'.

    The specification does not allow '_' in tag handles; it is
    accepted here anyway.
    """
    first = self.peek()
    if first != '!':
        raise ScannerError("while scanning a %s" % name, start_mark,
                "expected '!', but found %r" % first, self.get_mark())
    size = 1
    ch = self.peek(size)
    if ch != ' ':
        while True:
            is_word_char = ('0' <= ch <= '9' or 'A' <= ch <= 'Z'
                            or 'a' <= ch <= 'z' or ch in '-_')
            if not is_word_char:
                break
            size += 1
            ch = self.peek(size)
        if ch != '!':
            # Move to the offending character so the error mark
            # points at it.
            self.forward(size)
            raise ScannerError("while scanning a %s" % name, start_mark,
                    "expected '!', but found %r" % ch, self.get_mark())
        size += 1
    handle = self.prefix(size)
    self.forward(size)
    return handle

1371

def scan_tag_uri(self, name, start_mark):
    """Scan the URI part of a tag or tag directive.

    Accepts ASCII letters, digits and the punctuation listed below;
    '%XX' escapes are decoded through scan_uri_escapes.  The URI is
    not checked for being well-formed.  `name` is used in error
    messages only.

    Returns the decoded URI text.  Raises ScannerError if nothing was
    read.
    """
    parts = []
    pending = 0
    while True:
        ch = self.peek(pending)
        if not ('0' <= ch <= '9' or 'A' <= ch <= 'Z' or 'a' <= ch <= 'z'
                or ch in '-;/?:@&=+$,_.!~*\'()[]%'):
            break
        if ch != '%':
            pending += 1
            continue
        # Flush the literal text gathered so far, then decode the run
        # of escapes that starts here.
        parts.append(self.prefix(pending))
        self.forward(pending)
        pending = 0
        parts.append(self.scan_uri_escapes(name, start_mark))
    if pending:
        parts.append(self.prefix(pending))
        self.forward(pending)
    if not parts:
        raise ScannerError("while parsing a %s" % name, start_mark,
                "expected URI, but found %r" % ch, self.get_mark())
    return ''.join(parts)

1396

def scan_uri_escapes(self, name, start_mark):
    """Decode a run of consecutive '%XX' URI escapes.

    The escaped bytes are collected and decoded together as UTF-8, so
    a multi-byte character may be spread over several escapes.  `name`
    is used in error messages only.

    Returns the decoded text.  Raises ScannerError if a '%' is not
    followed by two hexadecimal digits or if the bytes are not valid
    UTF-8.
    """
    raw = bytearray()
    mark = self.get_mark()
    while self.peek() == '%':
        self.forward()
        for offset in (0, 1):
            digit = self.peek(offset)
            if digit not in '0123456789ABCDEFabcdef':
                raise ScannerError("while scanning a %s" % name, start_mark,
                        "expected URI escape sequence of 2 hexadecimal numbers, but found %r"
                        % digit, self.get_mark())
        raw.append(int(self.prefix(2), 16))
        self.forward(2)
    try:
        return bytes(raw).decode('utf-8')
    except UnicodeDecodeError as exc:
        raise ScannerError("while scanning a %s" % name, start_mark, str(exc), mark)

1415

def scan_line_break(self):
    """Consume one line break and return its normalized form.

    Transforms:
      '\\r\\n'    ->  '\\n'
      '\\r'      ->  '\\n'
      '\\n'      ->  '\\n'
      '\\x85'    ->  '\\n'
      '\\u2028'  ->  '\\u2028'
      '\\u2029'  ->  '\\u2029'
      default   ->  ''   (nothing is consumed)
    """
    ch = self.peek()
    if ch in '\r\n\x85':
        # A CR LF pair counts as a single break.
        if self.prefix(2) == '\r\n':
            self.forward(2)
        else:
            self.forward()
        return '\n'
    if ch in '\u2028\u2029':
        self.forward()
        return ch
    return ''