# $Id$
# Author: David Goodger <goodger@python.org>
# Copyright: This module has been placed in the public domain.

"""
This is the ``docutils.parsers.rst.states`` module, the core of
the reStructuredText parser. It defines the following:

:Classes:
    - `RSTStateMachine`: reStructuredText parser's entry point.
    - `NestedStateMachine`: recursive StateMachine.
    - `RSTState`: reStructuredText State superclass.
    - `Inliner`: For parsing inline markup.
    - `Body`: Generic classifier of the first line of a block.
    - `SpecializedBody`: Superclass for compound element members.
    - `BulletList`: Second and subsequent bullet_list list_items.
    - `DefinitionList`: Second+ definition_list_items.
    - `EnumeratedList`: Second+ enumerated_list list_items.
    - `FieldList`: Second+ fields.
    - `OptionList`: Second+ option_list_items.
    - `RFC2822List`: Second+ RFC2822-style fields.
    - `ExtensionOptions`: Parses directive option fields.
    - `Explicit`: Second+ explicit markup constructs.
    - `SubstitutionDef`: For embedded directives in substitution definitions.
    - `Text`: Classifier of second line of a text block.
    - `SpecializedText`: Superclass for continuation lines of Text-variants.
    - `Definition`: Second line of potential definition_list_item.
    - `Line`: Second line of overlined section title or transition marker.
    - `Struct`: obsolete, use `types.SimpleNamespace`.

:Exception classes:
    - `MarkupError`
    - `ParserError`
    - `MarkupMismatch`

:Functions:
    - `escape2null()`: Return a string, escape-backslashes converted to nulls.
    - `unescape()`: Return a string, nulls removed or restored to backslashes.

:Attributes:
    - `state_classes`: set of State classes used with `RSTStateMachine`.

Parser Overview
===============

The reStructuredText parser is implemented as a recursive state machine,
examining its input one line at a time. To understand how the parser works,
please first become familiar with the `docutils.statemachine` module. In the
description below, references are made to classes defined in this module;
please see the individual classes for details.

Parsing proceeds as follows:

1. The state machine examines each line of input, checking each of the
   transition patterns of the state `Body`, in order, looking for a match.
   The implicit transitions (blank lines and indentation) are checked before
   any others. The 'text' transition is a catch-all (matches anything).

2. The method associated with the matched transition pattern is called.

   A. Some transition methods are self-contained, appending elements to the
      document tree (`Body.doctest` parses a doctest block). The parser's
      current line index is advanced to the end of the element, and parsing
      continues with step 1.

   B. Other transition methods trigger the creation of a nested state
      machine, whose job is to parse a compound construct ('indent' does a
      block quote, 'bullet' does a bullet list, 'overline' does a section
      [first checking for a valid section header], etc.).

      - In the case of lists and explicit markup, a one-off state machine is
        created and run to parse contents of the first item.

      - A new state machine is created and its initial state is set to the
        appropriate specialized state (`BulletList` in the case of the
        'bullet' transition; see `SpecializedBody` for more detail). This
        state machine is run to parse the compound element (or series of
        explicit markup elements), and returns as soon as a non-member
        element is encountered. For example, the `BulletList` state machine
        ends as soon as it encounters an element which is not a list item of
        that bullet list. The optional omission of inter-element blank lines
        is enabled by this nested state machine.

      - The current line index is advanced to the end of the elements
        parsed, and parsing continues with step 1.

   C. The result of the 'text' transition depends on the next line of text.
      The current state is changed to `Text`, under which the second line is
      examined. If the second line is:

      - Indented: The element is a definition list item, and parsing
        proceeds similarly to step 2.B, using the `DefinitionList` state.

      - A line of uniform punctuation characters: The element is a section
        header; again, parsing proceeds as in step 2.B, and `Body` is still
        used.

      - Anything else: The element is a paragraph, which is examined for
        inline markup and appended to the parent element. Processing
        continues with step 1.
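
In practice, `RSTStateMachine` is driven by `docutils.parsers.rst.Parser`.
A minimal, illustrative driver (the source name below is a placeholder)::

    from docutils.frontend import get_default_settings
    from docutils.parsers.rst import Parser
    from docutils.utils import new_document

    parser = Parser()
    settings = get_default_settings(Parser)
    document = new_document('<sketch>', settings)
    parser.parse('A *short* example.', document)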
101"""
102
103from __future__ import annotations
104
105__docformat__ = 'reStructuredText'
106
107import copy
108import re
109from types import FunctionType, MethodType
110from types import SimpleNamespace as Struct
111
112from docutils import nodes, statemachine, utils
113from docutils import ApplicationError, DataError
114from docutils.statemachine import StateMachineWS, StateWS
115from docutils.nodes import fully_normalize_name as normalize_name
116from docutils.nodes import unescape, whitespace_normalize_name
117import docutils.parsers.rst
118from docutils.parsers.rst import directives, languages, tableparser, roles
119from docutils.utils import escape2null, column_width
120from docutils.utils import punctuation_chars, urischemes
121from docutils.utils import split_escaped_whitespace
122from docutils.utils._roman_numerals import (InvalidRomanNumeralError,
123 RomanNumeral)
124
125TYPE_CHECKING = False
126if TYPE_CHECKING:
127 from docutils.statemachine import StringList
128
129
130class MarkupError(DataError): pass
131class UnknownInterpretedRoleError(DataError): pass
132class InterpretedRoleNotImplementedError(DataError): pass
133class ParserError(ApplicationError): pass
134class MarkupMismatch(Exception): pass
135
136
137class RSTStateMachine(StateMachineWS):
138
139 """
140 reStructuredText's master StateMachine.
141
142 The entry point to reStructuredText parsing is the `run()` method.
143 """
144
145 def run(self, input_lines, document, input_offset=0, match_titles=True,
146 inliner=None) -> None:
147 """
148 Parse `input_lines` and modify the `document` node in place.
149
150 Extend `StateMachineWS.run()`: set up parse-global data and
151 run the StateMachine.
152 """
153 self.language = languages.get_language(
154 document.settings.language_code, document.reporter)
155 self.match_titles = match_titles
156 if inliner is None:
157 inliner = Inliner()
158 inliner.init_customizations(document.settings)
159 # A collection of objects to share with nested parsers.
160 # The attributes `reporter`, `section_level`, and
161 # `section_bubble_up_kludge` will be removed in Docutils 2.0
162 self.memo = Struct(document=document,
163 reporter=document.reporter, # ignored
164 language=self.language,
165 title_styles=[],
166 section_level=0, # ignored
167 section_bubble_up_kludge=False, # ignored
168 inliner=inliner)
169 self.document = document
170 self.attach_observer(document.note_source)
171 self.reporter = self.document.reporter
172 self.node = document
173 results = StateMachineWS.run(self, input_lines, input_offset,
174 input_source=document['source'])
175 assert results == [], 'RSTStateMachine.run() results should be empty!'
176 self.node = self.memo = None # remove unneeded references
177
178
179class NestedStateMachine(StateMachineWS):
180 """
181 StateMachine run from within other StateMachine runs, to parse nested
182 document structures.
183 """
184
185 def run(self, input_lines, input_offset, memo, node, match_titles=True):
186 """
187 Parse `input_lines` and populate `node`.
188
189 Use a separate "title style hierarchy" (changed in Docutils 0.23).
190
191 Extend `StateMachineWS.run()`: set up document-wide data.
192 """
193 self.match_titles = match_titles
194 self.memo = copy.copy(memo)
195 self.document = memo.document
196 self.attach_observer(self.document.note_source)
197 self.language = memo.language
198 self.reporter = self.document.reporter
199 self.node = node
200 if match_titles:
201 # Use a separate section title style hierarchy;
202 # ensure all sections in the `input_lines` are treated as
203 # subsections of the current section by blocking lower
204 # section levels with a style that is impossible in rST:
205 self.memo.title_styles = ['x'] * len(node.section_hierarchy())
206 results = StateMachineWS.run(self, input_lines, input_offset)
207 assert results == [], ('NestedStateMachine.run() results should be '
208 'empty!')
209 return results
210
211
212class RSTState(StateWS):
213
214 """
215 reStructuredText State superclass.
216
217 Contains methods used by all State subclasses.
218 """
219
220 nested_sm = NestedStateMachine
221 nested_sm_cache = []
222
223 def __init__(self, state_machine, debug=False) -> None:
224 self.nested_sm_kwargs = {'state_classes': state_classes,
225 'initial_state': 'Body'}
226 StateWS.__init__(self, state_machine, debug)
227
228 def runtime_init(self) -> None:
229 StateWS.runtime_init(self)
230 memo = self.state_machine.memo
231 self.memo = memo
232 self.document = memo.document
233 self.inliner = memo.inliner
234 self.reporter = self.document.reporter
235 # enable the reporter to determine source and source-line
236 if not hasattr(self.reporter, 'get_source_and_line'):
237 self.reporter.get_source_and_line = self.state_machine.get_source_and_line # noqa:E501
238
239 @property
240 def parent(self) -> nodes.Element | None:
241 return self.state_machine.node
242
243 @parent.setter
244 def parent(self, value: nodes.Element):
245 self.state_machine.node = value
246
247 def goto_line(self, abs_line_offset) -> None:
248 """
249 Jump to input line `abs_line_offset`, ignoring jumps past the end.
250 """
251 try:
252 self.state_machine.goto_line(abs_line_offset)
253 except EOFError:
254 pass
255
256 def no_match(self, context, transitions):
257 """
258 Override `StateWS.no_match` to generate a system message.
259
260 This code should never be run.
261 """
262 self.reporter.severe(
263 'Internal error: no transition pattern match. State: "%s"; '
264 'transitions: %s; context: %s; current line: %r.'
265 % (self.__class__.__name__, transitions, context,
266 self.state_machine.line))
267 return context, None, []
268
269 def bof(self, context):
270 """Called at beginning of file."""
271 return [], []
272
273 def nested_parse(self,
274 block: StringList,
275 input_offset: int,
276 node: nodes.Element,
277 match_titles: bool = False,
278 state_machine_class: StateMachineWS|None = None,
279 state_machine_kwargs: dict|None = None
280 ) -> int:
281 """
282 Parse the input `block` with a nested state-machine rooted at `node`.
283
284 :block:
285 reStructuredText source extract.
286 :input_offset:
287 Line number at start of the block.
288 :node:
289 Base node. All generated nodes will be appended to this node.
290 :match_titles:
291 Allow section titles?
292 A separate section title style hierarchy is used for the nested
293 parsing (all sections are subsections of the current section).
294 The calling code should check whether sections are valid
295 children of the base node and move them or warn otherwise.
296 :state_machine_class:
297 Default: `NestedStateMachine`.
298 :state_machine_kwargs:
299 Keyword arguments for the state-machine instantiation.
300 Default: `self.nested_sm_kwargs`.
301
302 Create a new state-machine instance if required.
303 Return new offset.
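
        Sketch of a typical call from a directive's ``run()`` method
        (``self`` there is the directive; its ``state`` attribute is an
        `RSTState` instance)::

            container = nodes.container()
            self.state.nested_parse(self.content, self.content_offset,
                                    container)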
304 """
305 use_default = 0
306 if state_machine_class is None:
307 state_machine_class = self.nested_sm
308 use_default += 1
309 if state_machine_kwargs is None:
310 state_machine_kwargs = self.nested_sm_kwargs
311 use_default += 1
312 state_machine = None
313 if use_default == 2:
314 try:
315 state_machine = self.nested_sm_cache.pop()
316 except IndexError:
317 pass
318 if not state_machine:
319 state_machine = state_machine_class(debug=self.debug,
320 **state_machine_kwargs)
321 # run the statemachine and populate `node`:
322 block_length = len(block)
323 state_machine.run(block, input_offset, memo=self.memo,
324 node=node, match_titles=match_titles)
325 # clean up
326 if use_default == 2:
327 self.nested_sm_cache.append(state_machine)
328 else:
329 state_machine.unlink()
330 new_offset = state_machine.abs_line_offset()
331 # No `block.parent` implies disconnected -- lines aren't in sync:
332 if block.parent and (len(block) - block_length) != 0:
333 # Adjustment for block if modified in nested parse:
334 self.state_machine.next_line(len(block) - block_length)
335 return new_offset
336
337 def nested_list_parse(self, block, input_offset, node, initial_state,
338 blank_finish,
339 blank_finish_state=None,
340 extra_settings={},
341 match_titles=False,
342 state_machine_class=None,
343 state_machine_kwargs=None):
344 """
345 Parse the input `block` with a nested state-machine rooted at `node`.
346
347 Create a new StateMachine rooted at `node` and run it over the
348 input `block` (see also `nested_parse()`).
349 Also keep track of optional intermediate blank lines and the
350 required final one.
351
352 Return new offset and a boolean indicating whether there was a
353 blank final line.
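
        Sketch of a typical call (mirroring `Body.bullet()`; ``ul`` and the
        initial ``blank_finish`` come from the caller)::

            offset = self.state_machine.line_offset + 1
            new_offset, blank_finish = self.nested_list_parse(
                self.state_machine.input_lines[offset:],
                input_offset=self.state_machine.abs_line_offset() + 1,
                node=ul, initial_state='BulletList',
                blank_finish=blank_finish)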
354 """
355 if state_machine_class is None:
356 state_machine_class = self.nested_sm
357 if state_machine_kwargs is None:
358 state_machine_kwargs = self.nested_sm_kwargs.copy()
359 state_machine_kwargs['initial_state'] = initial_state
360 state_machine = state_machine_class(debug=self.debug,
361 **state_machine_kwargs)
362 if blank_finish_state is None:
363 blank_finish_state = initial_state
364 state_machine.states[blank_finish_state].blank_finish = blank_finish
365 for key, value in extra_settings.items():
366 setattr(state_machine.states[initial_state], key, value)
367 state_machine.run(block, input_offset, memo=self.memo,
368 node=node, match_titles=match_titles)
369 blank_finish = state_machine.states[blank_finish_state].blank_finish
370 state_machine.unlink()
371 return state_machine.abs_line_offset(), blank_finish
372
373 def section(self, title, source, style, lineno, messages) -> None:
374 """Check for a valid subsection and create one if it checks out."""
375 if self.check_subsection(source, style, lineno):
376 self.new_subsection(title, lineno, messages)
377
378 def check_subsection(self, source, style, lineno) -> bool:
379 """
380 Check for a valid subsection header. Update section data in `memo`.
381
382 When a new section is reached that isn't a subsection of the current
383 section, set `self.parent` to the new section's parent section
384 (or the root node if the new section is a top-level section).
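
        Worked example: with ``memo.title_styles == ['=', '-']``, a title
        underlined with ``'-'`` is a level-2 (sub)section; a not-yet-seen
        style would claim level 3 and is only accepted if the current
        section is at level 2 or deeper.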
385 """
386 title_styles = self.memo.title_styles
387 parent_sections = self.parent.section_hierarchy()
388 # current section level: (0 root, 1 section, 2 subsection, ...)
389 oldlevel = len(parent_sections)
390 # new section level:
391 try: # check for existing title style
392 newlevel = title_styles.index(style) + 1
393 except ValueError: # new title style
394 newlevel = len(title_styles) + 1
395 # The new level must not be deeper than an immediate child
396 # of the current level:
397 if newlevel > oldlevel + 1:
398 styles = ' '.join('/'.join(style) for style in title_styles)
399 self.parent += self.reporter.error(
400 'Inconsistent title style:'
401 f' skip from level {oldlevel} to {newlevel}.',
402 nodes.literal_block('', source),
403 nodes.paragraph('', f'Established title styles: {styles}'),
404 line=lineno)
405 return False
406 # Update parent state:
407 if newlevel > len(title_styles):
408 title_styles.append(style)
409 self.memo.section_level = newlevel
410 if newlevel <= oldlevel:
411 # new section is sibling or higher up in the section hierarchy
412 self.parent = parent_sections[newlevel-1].parent
413 return True
414
415 def title_inconsistent(self, sourcetext, lineno):
416 # Ignored. Will be removed in Docutils 2.0.
417 error = self.reporter.error(
418 'Title level inconsistent:', nodes.literal_block('', sourcetext),
419 line=lineno)
420 return error
421
422 def new_subsection(self, title, lineno, messages):
423 """Append new subsection to document tree."""
424 section_node = nodes.section()
425 self.parent += section_node
426 textnodes, title_messages = self.inline_text(title, lineno)
427 titlenode = nodes.title(title, '', *textnodes)
428 name = normalize_name(titlenode.astext())
429 section_node['names'].append(name)
430 section_node += titlenode
431 section_node += messages
432 section_node += title_messages
433 self.document.note_implicit_target(section_node, section_node)
434 # Update state:
435 self.parent = section_node
436
437 def paragraph(self, lines, lineno):
438 """
439 Return a list (paragraph & messages) & a boolean: literal_block next?
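
        Informal examples of the trailing ``::`` handling:

        * ``'Paragraph::'`` -- paragraph text ``'Paragraph:'``, literal next
        * ``'Paragraph: ::'`` -- paragraph text ``'Paragraph:'``, literal next
        * ``'::'`` -- no paragraph, literal next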
440 """
441 data = '\n'.join(lines).rstrip()
442 if re.search(r'(?<!\\)(\\\\)*::$', data):
443 if len(data) == 2:
444 return [], 1
445 elif data[-3] in ' \n':
446 text = data[:-3].rstrip()
447 else:
448 text = data[:-1]
449 literalnext = 1
450 else:
451 text = data
452 literalnext = 0
453 textnodes, messages = self.inline_text(text, lineno)
454 p = nodes.paragraph(data, '', *textnodes)
455 p.source, p.line = self.state_machine.get_source_and_line(lineno)
456 return [p] + messages, literalnext
457
458 def inline_text(self, text, lineno):
459 """
460 Return 2 lists: nodes (text and inline elements), and system_messages.
461 """
462 nodes, messages = self.inliner.parse(text, lineno,
463 self.memo, self.parent)
464 return nodes, messages
465
466 def unindent_warning(self, node_name):
467 # the actual problem is one line below the current line
468 lineno = self.state_machine.abs_line_number() + 1
469 return self.reporter.warning('%s ends without a blank line; '
470 'unexpected unindent.' % node_name,
471 line=lineno)
472
473
474def build_regexp(definition, compile_patterns=True):
475 """
476 Build, compile and return a regular expression based on `definition`.
477
478 :Parameter: `definition`: a 4-tuple (group name, prefix, suffix, parts),
479 where "parts" is a list of regular expressions and/or regular
480 expression definitions to be joined into an or-group.
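
    Illustrative example (not a pattern used by the parser)::

        build_regexp(('tag', '<', '>', ['em', 'strong']),
                     compile_patterns=False)
        # returns '<(?P<tag>em|strong)>'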
481 """
482 name, prefix, suffix, parts = definition
483 part_strings = []
484 for part in parts:
485 if isinstance(part, tuple):
486 part_strings.append(build_regexp(part, None))
487 else:
488 part_strings.append(part)
489 or_group = '|'.join(part_strings)
490 regexp = '%(prefix)s(?P<%(name)s>%(or_group)s)%(suffix)s' % locals()
491 if compile_patterns:
492 return re.compile(regexp)
493 else:
494 return regexp
495
496
497class Inliner:
498
499 """
500 Parse inline markup; call the `parse()` method.
501 """
502
503 def __init__(self) -> None:
504 self.implicit_dispatch = []
505 """List of (pattern, bound method) tuples, used by
506 `self.implicit_inline`."""
507
508 def init_customizations(self, settings) -> None:
509 # lookahead and look-behind expressions for inline markup rules
510 if getattr(settings, 'character_level_inline_markup', False):
511 start_string_prefix = '(^|(?<!\x00))'
512 end_string_suffix = ''
513 else:
514 start_string_prefix = ('(^|(?<=\\s|[%s%s]))' %
515 (punctuation_chars.openers,
516 punctuation_chars.delimiters))
517 end_string_suffix = ('($|(?=\\s|[\x00%s%s%s]))' %
518 (punctuation_chars.closing_delimiters,
519 punctuation_chars.delimiters,
520 punctuation_chars.closers))
521 args = locals().copy()
522 args.update(vars(self.__class__))
523
524 parts = ('initial_inline', start_string_prefix, '',
525 [
526 ('start', '', self.non_whitespace_after, # simple start-strings
527 [r'\*\*', # strong
528 r'\*(?!\*)', # emphasis but not strong
529 r'``', # literal
530 r'_`', # inline internal target
531 r'\|(?!\|)'] # substitution reference
532 ),
533 ('whole', '', end_string_suffix, # whole constructs
534 [ # reference name & end-string
535 r'(?P<refname>%s)(?P<refend>__?)' % self.simplename,
536 ('footnotelabel', r'\[', r'(?P<fnend>\]_)',
537 [r'[0-9]+', # manually numbered
538 r'\#(%s)?' % self.simplename, # auto-numbered (w/ label?)
539 r'\*', # auto-symbol
540 r'(?P<citationlabel>%s)' % self.simplename, # citation ref
541 ]
542 )
543 ]
544 ),
545 ('backquote', # interpreted text or phrase reference
546 '(?P<role>(:%s:)?)' % self.simplename, # optional role
547 self.non_whitespace_after,
548 ['`(?!`)'] # but not literal
549 )
550 ]
551 )
552 self.start_string_prefix = start_string_prefix
553 self.end_string_suffix = end_string_suffix
554 self.parts = parts
555
556 self.patterns = Struct(
557 initial=build_regexp(parts),
558 emphasis=re.compile(self.non_whitespace_escape_before
559 + r'(\*)' + end_string_suffix),
560 strong=re.compile(self.non_whitespace_escape_before
561 + r'(\*\*)' + end_string_suffix),
562 interpreted_or_phrase_ref=re.compile(
563 r"""
564 %(non_unescaped_whitespace_escape_before)s
565 (
566 `
567 (?P<suffix>
568 (?P<role>:%(simplename)s:)?
569 (?P<refend>__?)?
570 )
571 )
572 %(end_string_suffix)s
573 """ % args, re.VERBOSE),
574 embedded_link=re.compile(
575 r"""
576 (
577 (?:[ \n]+|^) # spaces or beginning of line/string
578 < # open bracket
579 %(non_whitespace_after)s
580 (([^<>]|\x00[<>])+) # anything but unescaped angle brackets
581 %(non_whitespace_escape_before)s
582 > # close bracket
583 )
584 $ # end of string
585 """ % args, re.VERBOSE),
586 literal=re.compile(self.non_whitespace_before + '(``)'
587 + end_string_suffix),
588 target=re.compile(self.non_whitespace_escape_before
589 + r'(`)' + end_string_suffix),
590 substitution_ref=re.compile(self.non_whitespace_escape_before
591 + r'(\|_{0,2})'
592 + end_string_suffix),
593 email=re.compile(self.email_pattern % args + '$',
594 re.VERBOSE),
595 uri=re.compile(
596 (r"""
597 %(start_string_prefix)s
598 (?P<whole>
599 (?P<absolute> # absolute URI
600 (?P<scheme> # scheme (http, ftp, mailto)
601 [a-zA-Z][a-zA-Z0-9.+-]*
602 )
603 :
604 (
605 ( # either:
606 (//?)? # hierarchical URI
607 %(uric)s* # URI characters
608 %(uri_end)s # final URI char
609 )
610 ( # optional query
611 \?%(uric)s*
612 %(uri_end)s
613 )?
614 ( # optional fragment
615 \#%(uric)s*
616 %(uri_end)s
617 )?
618 )
619 )
620 | # *OR*
621 (?P<email> # email address
622 """ + self.email_pattern + r"""
623 )
624 )
625 %(end_string_suffix)s
626 """) % args, re.VERBOSE),
627 pep=re.compile(
628 r"""
629 %(start_string_prefix)s
630 (
631 (pep-(?P<pepnum1>\d+)(.txt)?) # reference to source file
632 |
633 (PEP\s+(?P<pepnum2>\d+)) # reference by name
634 )
635 %(end_string_suffix)s""" % args, re.VERBOSE),
636 rfc=re.compile(
637 r"""
638 %(start_string_prefix)s
639 (RFC(-|\s+)?(?P<rfcnum>\d+))
640 %(end_string_suffix)s""" % args, re.VERBOSE))
641
642 self.implicit_dispatch.append((self.patterns.uri,
643 self.standalone_uri))
644 if settings.pep_references:
645 self.implicit_dispatch.append((self.patterns.pep,
646 self.pep_reference))
647 if settings.rfc_references:
648 self.implicit_dispatch.append((self.patterns.rfc,
649 self.rfc_reference))
650
651 def parse(self, text, lineno, memo, parent):
652 # Needs to be refactored for nested inline markup.
653 # Add nested_parse() method?
654 """
655 Return 2 lists: nodes (text and inline elements), and system_messages.
656
657 Using `self.patterns.initial`, a pattern which matches start-strings
658 (emphasis, strong, interpreted, phrase reference, literal,
659 substitution reference, and inline target) and complete constructs
660 (simple reference, footnote reference), search for a candidate. When
661 one is found, check for validity (e.g., not a quoted '*' character).
662 If valid, search for the corresponding end string if applicable, and
663 check it for validity. If not found or invalid, generate a warning
664 and ignore the start-string. Implicit inline markup (e.g. standalone
665 URIs) is found last.
666
667 :text: source string
668 :lineno: absolute line number, cf. `statemachine.get_source_and_line()`
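
        Informal example: for ``'Hello *world*'`` the first list holds a
        `nodes.Text` node ``'Hello '`` followed by a `nodes.emphasis`
        element wrapping ``'world'``; the second list is empty.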
669 """
670 self.document = memo.document
671 self.language = memo.language
672 self.reporter = self.document.reporter
673 self.parent = parent
674 pattern_search = self.patterns.initial.search
675 dispatch = self.dispatch
676 remaining = escape2null(text)
677 processed = []
678 unprocessed = []
679 messages = []
680 while remaining:
681 match = pattern_search(remaining)
682 if match:
683 groups = match.groupdict()
684 method = dispatch[groups['start'] or groups['backquote']
685 or groups['refend'] or groups['fnend']]
686 before, inlines, remaining, sysmessages = method(self, match,
687 lineno)
688 unprocessed.append(before)
689 messages += sysmessages
690 if inlines:
691 processed += self.implicit_inline(''.join(unprocessed),
692 lineno)
693 processed += inlines
694 unprocessed = []
695 else:
696 break
697 remaining = ''.join(unprocessed) + remaining
698 if remaining:
699 processed += self.implicit_inline(remaining, lineno)
700 return processed, messages
701
702 # Inline object recognition
703 # -------------------------
704 # See also init_customizations().
705 non_whitespace_before = r'(?<!\s)'
706 non_whitespace_escape_before = r'(?<![\s\x00])'
707 non_unescaped_whitespace_escape_before = r'(?<!(?<!\x00)[\s\x00])'
708 non_whitespace_after = r'(?!\s)'
709 # Alphanumerics with isolated internal [-._+:] chars (i.e. not 2 together):
710 simplename = r'(?:(?!_)\w)+(?:[-._+:](?:(?!_)\w)+)*'
711 # Valid URI characters (see RFC 2396 & RFC 2732);
712 # final \x00 allows backslash escapes in URIs:
713 uric = r"""[-_.!~*'()[\];/:@&=+$,%a-zA-Z0-9\x00]"""
714 # Delimiter indicating the end of a URI (not part of the URI):
715 uri_end_delim = r"""[>]"""
716 # Last URI character; same as uric but no punctuation:
717 urilast = r"""[_~*/=+a-zA-Z0-9]"""
718 # End of a URI (either 'urilast' or 'uric followed by a
719 # uri_end_delim'):
720 uri_end = r"""(?:%(urilast)s|%(uric)s(?=%(uri_end_delim)s))""" % locals()
721 emailc = r"""[-_!~*'{|}/#?^`&=+$%a-zA-Z0-9\x00]"""
722 email_pattern = r"""
723 %(emailc)s+(?:\.%(emailc)s+)* # name
724 (?<!\x00)@ # at
725 %(emailc)s+(?:\.%(emailc)s*)* # host
726 %(uri_end)s # final URI char
727 """
728
729 def quoted_start(self, match):
730 """Test if inline markup start-string is 'quoted'.
731
        'Quoted' in this context means the start-string is enclosed in a pair
        of matching opening/closing delimiters (not necessarily quotes)
        or at the end of the text.
735 """
736 string = match.string
737 start = match.start()
738 if start == 0: # start-string at beginning of text
739 return False
740 prestart = string[start - 1]
741 try:
742 poststart = string[match.end()]
743 except IndexError: # start-string at end of text
744 return True # not "quoted" but no markup start-string either
745 return punctuation_chars.match_chars(prestart, poststart)
746
747 def inline_obj(self, match, lineno, end_pattern, nodeclass,
748 restore_backslashes=False):
749 string = match.string
750 matchstart = match.start('start')
751 matchend = match.end('start')
752 if self.quoted_start(match):
753 return string[:matchend], [], string[matchend:], [], ''
754 endmatch = end_pattern.search(string[matchend:])
755 if endmatch and endmatch.start(1): # 1 or more chars
756 text = endmatch.string[:endmatch.start(1)]
757 if restore_backslashes:
758 text = unescape(text, True)
759 textend = matchend + endmatch.end(1)
760 rawsource = unescape(string[matchstart:textend], True)
761 node = nodeclass(rawsource, text)
762 return (string[:matchstart], [node],
763 string[textend:], [], endmatch.group(1))
764 msg = self.reporter.warning(
765 'Inline %s start-string without end-string.'
766 % nodeclass.__name__, line=lineno)
767 text = unescape(string[matchstart:matchend], True)
768 prb = self.problematic(text, text, msg)
769 return string[:matchstart], [prb], string[matchend:], [msg], ''
770
771 def problematic(self, text, rawsource, message):
772 msgid = self.document.set_id(message, self.parent)
773 problematic = nodes.problematic(rawsource, text, refid=msgid)
774 prbid = self.document.set_id(problematic)
775 message.add_backref(prbid)
776 return problematic
777
778 def emphasis(self, match, lineno):
779 before, inlines, remaining, sysmessages, endstring = self.inline_obj(
780 match, lineno, self.patterns.emphasis, nodes.emphasis)
781 return before, inlines, remaining, sysmessages
782
783 def strong(self, match, lineno):
784 before, inlines, remaining, sysmessages, endstring = self.inline_obj(
785 match, lineno, self.patterns.strong, nodes.strong)
786 return before, inlines, remaining, sysmessages
787
788 def interpreted_or_phrase_ref(self, match, lineno):
789 end_pattern = self.patterns.interpreted_or_phrase_ref
790 string = match.string
791 matchstart = match.start('backquote')
792 matchend = match.end('backquote')
793 rolestart = match.start('role')
794 role = match.group('role')
795 position = ''
796 if role:
797 role = role[1:-1]
798 position = 'prefix'
799 elif self.quoted_start(match):
800 return string[:matchend], [], string[matchend:], []
801 endmatch = end_pattern.search(string[matchend:])
802 if endmatch and endmatch.start(1): # 1 or more chars
803 textend = matchend + endmatch.end()
804 if endmatch.group('role'):
805 if role:
806 msg = self.reporter.warning(
807 'Multiple roles in interpreted text (both '
808 'prefix and suffix present; only one allowed).',
809 line=lineno)
810 text = unescape(string[rolestart:textend], True)
811 prb = self.problematic(text, text, msg)
812 return string[:rolestart], [prb], string[textend:], [msg]
813 role = endmatch.group('suffix')[1:-1]
814 position = 'suffix'
815 escaped = endmatch.string[:endmatch.start(1)]
816 rawsource = unescape(string[matchstart:textend], True)
817 if rawsource[-1:] == '_':
818 if role:
819 msg = self.reporter.warning(
820 'Mismatch: both interpreted text role %s and '
821 'reference suffix.' % position, line=lineno)
822 text = unescape(string[rolestart:textend], True)
823 prb = self.problematic(text, text, msg)
824 return string[:rolestart], [prb], string[textend:], [msg]
825 return self.phrase_ref(string[:matchstart], string[textend:],
826 rawsource, escaped)
827 else:
828 rawsource = unescape(string[rolestart:textend], True)
829 nodelist, messages = self.interpreted(rawsource, escaped, role,
830 lineno)
831 return (string[:rolestart], nodelist,
832 string[textend:], messages)
833 msg = self.reporter.warning(
834 'Inline interpreted text or phrase reference start-string '
835 'without end-string.', line=lineno)
836 text = unescape(string[matchstart:matchend], True)
837 prb = self.problematic(text, text, msg)
838 return string[:matchstart], [prb], string[matchend:], [msg]
839
840 def phrase_ref(self, before, after, rawsource, escaped, text=None):
841 # `text` is ignored (since 0.16)
842 match = self.patterns.embedded_link.search(escaped)
843 if match: # embedded <URI> or <alias_>
844 text = escaped[:match.start(0)]
845 unescaped = unescape(text)
846 rawtext = unescape(text, True)
847 aliastext = match.group(2)
848 rawaliastext = unescape(aliastext, True)
849 underscore_escaped = rawaliastext.endswith(r'\_')
850 if (aliastext.endswith('_')
851 and not (underscore_escaped
852 or self.patterns.uri.match(aliastext))):
853 aliastype = 'name'
854 alias = normalize_name(unescape(aliastext[:-1]))
855 target = nodes.target(match.group(1), refname=alias)
856 target.indirect_reference_name = whitespace_normalize_name(
857 unescape(aliastext[:-1]))
858 else:
859 aliastype = 'uri'
860 # remove unescaped whitespace
861 alias_parts = split_escaped_whitespace(match.group(2))
862 alias = ' '.join(''.join(part.split())
863 for part in alias_parts)
864 alias = self.adjust_uri(unescape(alias))
865 if alias.endswith(r'\_'):
866 alias = alias[:-2] + '_'
867 target = nodes.target(match.group(1), refuri=alias)
868 target.referenced = 1
869 if not aliastext:
870 raise ApplicationError('problem with embedded link: %r'
871 % aliastext)
872 if not text:
873 text = alias
874 unescaped = unescape(text)
875 rawtext = rawaliastext
876 else:
877 text = escaped
878 unescaped = unescape(text)
879 target = None
880 rawtext = unescape(escaped, True)
881
882 refname = normalize_name(unescaped)
883 reference = nodes.reference(rawsource, text,
884 name=whitespace_normalize_name(unescaped))
885 reference[0].rawsource = rawtext
886
887 node_list = [reference]
888
889 if rawsource[-2:] == '__':
890 if target and (aliastype == 'name'):
891 reference['refname'] = alias
892 self.document.note_refname(reference)
893 # self.document.note_indirect_target(target) # required?
894 elif target and (aliastype == 'uri'):
895 reference['refuri'] = alias
896 else:
897 reference['anonymous'] = True
898 else:
899 if target:
900 target['names'].append(refname)
901 if aliastype == 'name':
902 reference['refname'] = alias
903 self.document.note_indirect_target(target)
904 self.document.note_refname(reference)
905 else:
906 reference['refuri'] = alias
907 # target.note_referenced_by(name=refname)
908 self.document.note_implicit_target(target, self.parent)
909 node_list.append(target)
910 else:
911 reference['refname'] = refname
912 self.document.note_refname(reference)
913 return before, node_list, after, []
914
915 def adjust_uri(self, uri):
916 match = self.patterns.email.match(uri)
917 if match:
918 return 'mailto:' + uri
919 else:
920 return uri
921
922 def interpreted(self, rawsource, text, role, lineno):
923 role_fn, messages = roles.role(role, self.language, lineno,
924 self.reporter)
925 if role_fn:
926 nodes, messages2 = role_fn(role, rawsource, text, lineno, self)
927 return nodes, messages + messages2
928 else:
929 msg = self.reporter.error(
930 'Unknown interpreted text role "%s".' % role,
931 line=lineno)
932 return ([self.problematic(rawsource, rawsource, msg)],
933 messages + [msg])
934
935 def literal(self, match, lineno):
936 before, inlines, remaining, sysmessages, endstring = self.inline_obj(
937 match, lineno, self.patterns.literal, nodes.literal,
938 restore_backslashes=True)
939 return before, inlines, remaining, sysmessages
940
941 def inline_internal_target(self, match, lineno):
942 before, inlines, remaining, sysmessages, endstring = self.inline_obj(
943 match, lineno, self.patterns.target, nodes.target)
944 if inlines and isinstance(inlines[0], nodes.target):
945 assert len(inlines) == 1
946 target = inlines[0]
947 name = normalize_name(target.astext())
948 target['names'].append(name)
949 self.document.note_explicit_target(target, self.parent)
950 return before, inlines, remaining, sysmessages
951
952 def substitution_reference(self, match, lineno):
953 before, inlines, remaining, sysmessages, endstring = self.inline_obj(
954 match, lineno, self.patterns.substitution_ref,
955 nodes.substitution_reference)
956 if len(inlines) == 1:
957 subref_node = inlines[0]
958 if isinstance(subref_node, nodes.substitution_reference):
959 subref_text = subref_node.astext()
960 self.document.note_substitution_ref(subref_node, subref_text)
961 if endstring[-1:] == '_':
962 reference_node = nodes.reference(
963 '|%s%s' % (subref_text, endstring), '')
964 if endstring[-2:] == '__':
965 reference_node['anonymous'] = True
966 else:
967 reference_node['refname'] = normalize_name(subref_text)
968 self.document.note_refname(reference_node)
969 reference_node += subref_node
970 inlines = [reference_node]
971 return before, inlines, remaining, sysmessages
972
973 def footnote_reference(self, match, lineno):
974 """
975 Handles `nodes.footnote_reference` and `nodes.citation_reference`
976 elements.
977 """
978 label = match.group('footnotelabel')
979 refname = normalize_name(label)
980 string = match.string
981 before = string[:match.start('whole')]
982 remaining = string[match.end('whole'):]
983 if match.group('citationlabel'):
984 refnode = nodes.citation_reference('[%s]_' % label,
985 refname=refname)
986 refnode += nodes.Text(label)
987 self.document.note_citation_ref(refnode)
988 else:
989 refnode = nodes.footnote_reference('[%s]_' % label)
990 if refname[0] == '#':
991 refname = refname[1:]
992 refnode['auto'] = 1
993 self.document.note_autofootnote_ref(refnode)
994 elif refname == '*':
995 refname = ''
996 refnode['auto'] = '*'
997 self.document.note_symbol_footnote_ref(
998 refnode)
999 else:
1000 refnode += nodes.Text(label)
1001 if refname:
1002 refnode['refname'] = refname
1003 self.document.note_footnote_ref(refnode)
1004 if utils.get_trim_footnote_ref_space(self.document.settings):
1005 before = before.rstrip()
1006 return before, [refnode], remaining, []
1007
1008 def reference(self, match, lineno, anonymous=False):
1009 referencename = match.group('refname')
1010 refname = normalize_name(referencename)
1011 referencenode = nodes.reference(
1012 referencename + match.group('refend'), referencename,
1013 name=whitespace_normalize_name(referencename))
1014 referencenode[0].rawsource = referencename
1015 if anonymous:
1016 referencenode['anonymous'] = True
1017 else:
1018 referencenode['refname'] = refname
1019 self.document.note_refname(referencenode)
1020 string = match.string
1021 matchstart = match.start('whole')
1022 matchend = match.end('whole')
1023 return string[:matchstart], [referencenode], string[matchend:], []
1024
1025 def anonymous_reference(self, match, lineno):
1026 return self.reference(match, lineno, anonymous=True)
1027
1028 def standalone_uri(self, match, lineno):
1029 if (not match.group('scheme')
1030 or match.group('scheme').lower() in urischemes.schemes):
1031 if match.group('email'):
1032 addscheme = 'mailto:'
1033 else:
1034 addscheme = ''
1035 text = match.group('whole')
1036 refuri = addscheme + unescape(text)
1037 reference = nodes.reference(unescape(text, True), text,
1038 refuri=refuri)
1039 return [reference]
1040 else: # not a valid scheme
1041 raise MarkupMismatch
1042
1043 def pep_reference(self, match, lineno):
1044 text = match.group(0)
1045 if text.startswith('pep-'):
1046 pepnum = int(unescape(match.group('pepnum1')))
1047 elif text.startswith('PEP'):
1048 pepnum = int(unescape(match.group('pepnum2')))
1049 else:
1050 raise MarkupMismatch
1051 ref = (self.document.settings.pep_base_url
1052 + self.document.settings.pep_file_url_template % pepnum)
1053 return [nodes.reference(unescape(text, True), text, refuri=ref)]
1054
1055 rfc_url = 'rfc%d.html'
1056
1057 def rfc_reference(self, match, lineno):
1058 text = match.group(0)
1059 if text.startswith('RFC'):
1060 rfcnum = int(unescape(match.group('rfcnum')))
1061 ref = self.document.settings.rfc_base_url + self.rfc_url % rfcnum
1062 else:
1063 raise MarkupMismatch
1064 return [nodes.reference(unescape(text, True), text, refuri=ref)]
1065
1066 def implicit_inline(self, text, lineno):
1067 """
1068 Check each of the patterns in `self.implicit_dispatch` for a match,
1069 and dispatch to the stored method for the pattern. Recursively check
1070 the text before and after the match. Return a list of `nodes.Text`
1071 and inline element nodes.
1072 """
1073 if not text:
1074 return []
1075 for pattern, method in self.implicit_dispatch:
1076 match = pattern.search(text)
1077 if match:
1078 try:
1079 # Must recurse on strings before *and* after the match;
1080 # there may be multiple patterns.
1081 return (self.implicit_inline(text[:match.start()], lineno)
1082 + method(match, lineno)
1083 + self.implicit_inline(text[match.end():], lineno))
1084 except MarkupMismatch:
1085 pass
1086 return [nodes.Text(text)]
1087
1088 dispatch = {'*': emphasis,
1089 '**': strong,
1090 '`': interpreted_or_phrase_ref,
1091 '``': literal,
1092 '_`': inline_internal_target,
1093 ']_': footnote_reference,
1094 '|': substitution_reference,
1095 '_': reference,
1096 '__': anonymous_reference}
1097
1098
1099def _loweralpha_to_int(s, _zero=(ord('a')-1)):
1100 return ord(s) - _zero
1101
1102
1103def _upperalpha_to_int(s, _zero=(ord('A')-1)):
1104 return ord(s) - _zero
1105
1106
1107class Body(RSTState):
1108
1109 """
1110 Generic classifier of the first line of a block.
1111 """
1112
1113 double_width_pad_char = tableparser.TableParser.double_width_pad_char
1114 """Padding character for East Asian double-width text."""
1115
1116 enum = Struct()
1117 """Enumerated list parsing information."""
1118
1119 enum.formatinfo = {
1120 'parens': Struct(prefix='(', suffix=')', start=1, end=-1),
1121 'rparen': Struct(prefix='', suffix=')', start=0, end=-1),
1122 'period': Struct(prefix='', suffix='.', start=0, end=-1)}
1123 enum.formats = enum.formatinfo.keys()
1124 enum.sequences = ['arabic', 'loweralpha', 'upperalpha',
1125 'lowerroman', 'upperroman'] # ORDERED!
1126 enum.sequencepats = {'arabic': '[0-9]+',
1127 'loweralpha': '[a-z]',
1128 'upperalpha': '[A-Z]',
1129 'lowerroman': '[ivxlcdm]+',
1130 'upperroman': '[IVXLCDM]+'}
1131 enum.converters = {'arabic': int,
1132 'loweralpha': _loweralpha_to_int,
1133 'upperalpha': _upperalpha_to_int,
1134 'lowerroman': RomanNumeral.from_string,
1135 'upperroman': RomanNumeral.from_string}
1136
1137 enum.sequenceregexps = {}
1138 for sequence in enum.sequences:
1139 enum.sequenceregexps[sequence] = re.compile(
1140 enum.sequencepats[sequence] + '$')
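
    # Informal examples: enum.converters['upperalpha']('C') -> 3 and
    # enum.converters['arabic']('10') -> 10; the Roman sequences return
    # `RomanNumeral` instances, which `parse_enumerator()` passes to int().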
1141
1142 grid_table_top_pat = re.compile(r'\+-[-+]+-\+ *$')
1143 """Matches the top (& bottom) of a full table)."""
1144
1145 simple_table_top_pat = re.compile('=+( +=+)+ *$')
1146 """Matches the top of a simple table."""
1147
1148 simple_table_border_pat = re.compile('=+[ =]*$')
1149 """Matches the bottom & header bottom of a simple table."""
1150
1151 pats = {}
1152 """Fragments of patterns used by transitions."""
1153
1154 pats['nonalphanum7bit'] = '[!-/:-@[-`{-~]'
1155 pats['alpha'] = '[a-zA-Z]'
1156 pats['alphanum'] = '[a-zA-Z0-9]'
1157 pats['alphanumplus'] = '[a-zA-Z0-9_-]'
1158 pats['enum'] = ('(%(arabic)s|%(loweralpha)s|%(upperalpha)s|%(lowerroman)s'
1159 '|%(upperroman)s|#)' % enum.sequencepats)
1160 pats['optname'] = '%(alphanum)s%(alphanumplus)s*' % pats
1161 # @@@ Loosen up the pattern? Allow Unicode?
1162 pats['optarg'] = '(%(alpha)s%(alphanumplus)s*|<[^<>]+>)' % pats
1163 pats['shortopt'] = r'(-|\+)%(alphanum)s( ?%(optarg)s)?' % pats
1164 pats['longopt'] = r'(--|/)%(optname)s([ =]%(optarg)s)?' % pats
1165 pats['option'] = r'(%(shortopt)s|%(longopt)s)' % pats
1166
1167 for format in enum.formats:
1168 pats[format] = '(?P<%s>%s%s%s)' % (
1169 format, re.escape(enum.formatinfo[format].prefix),
1170 pats['enum'], re.escape(enum.formatinfo[format].suffix))
1171
1172 patterns = {
1173 'bullet': '[-+*\u2022\u2023\u2043]( +|$)',
1174 'enumerator': r'(%(parens)s|%(rparen)s|%(period)s)( +|$)' % pats,
1175 'field_marker': r':(?![: ])([^:\\]|\\.|:(?!([ `]|$)))*(?<! ):( +|$)',
1176 'option_marker': r'%(option)s(, %(option)s)*( +| ?$)' % pats,
1177 'doctest': r'>>>( +|$)',
1178 'line_block': r'\|( +|$)',
1179 'grid_table_top': grid_table_top_pat,
1180 'simple_table_top': simple_table_top_pat,
1181 'explicit_markup': r'\.\.( +|$)',
1182 'anonymous': r'__( +|$)',
1183 'line': r'(%(nonalphanum7bit)s)\1* *$' % pats,
1184 'text': r''}
1185 initial_transitions = (
1186 'bullet',
1187 'enumerator',
1188 'field_marker',
1189 'option_marker',
1190 'doctest',
1191 'line_block',
1192 'grid_table_top',
1193 'simple_table_top',
1194 'explicit_markup',
1195 'anonymous',
1196 'line',
1197 'text')
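
    # For example, a line '* item' is picked up by the 'bullet' transition,
    # '1. item' by 'enumerator', ':name: value' by 'field_marker', and
    # '>>> 1 + 1' by 'doctest'; 'text' matches any line not claimed earlier.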
1198
1199 def indent(self, match, context, next_state):
1200 """Block quote."""
1201 (indented, indent, line_offset, blank_finish
1202 ) = self.state_machine.get_indented()
1203 elements = self.block_quote(indented, line_offset)
1204 self.parent += elements
1205 if not blank_finish:
1206 self.parent += self.unindent_warning('Block quote')
1207 return context, next_state, []
1208
1209 def block_quote(self, indented, line_offset):
1210 elements = []
1211 while indented:
1212 blockquote = nodes.block_quote(rawsource='\n'.join(indented))
1213 (blockquote.source, blockquote.line
1214 ) = self.state_machine.get_source_and_line(line_offset+1)
1215 (blockquote_lines,
1216 attribution_lines,
1217 attribution_offset,
1218 indented,
1219 new_line_offset) = self.split_attribution(indented, line_offset)
1220 self.nested_parse(blockquote_lines, line_offset, blockquote)
1221 elements.append(blockquote)
1222 if attribution_lines:
1223 attribution, messages = self.parse_attribution(
1224 attribution_lines, line_offset+attribution_offset)
1225 blockquote += attribution
1226 elements += messages
1227 line_offset = new_line_offset
1228 while indented and not indented[0]:
1229 indented = indented[1:]
1230 line_offset += 1
1231 return elements
1232
1233 # U+2014 is an em-dash:
1234 attribution_pattern = re.compile('(---?(?!-)|\u2014) *(?=[^ \\n])')
1235
1236 def split_attribution(self, indented, line_offset):
1237 """
1238 Check for a block quote attribution and split it off:
1239
1240 * First line after a blank line must begin with a dash ("--", "---",
1241 em-dash; matches `self.attribution_pattern`).
1242 * Every line after that must have consistent indentation.
1243 * Attributions must be preceded by block quote content.
1244
1245 Return a tuple of: (block quote content lines, attribution lines,
1246 attribution offset, remaining indented lines, remaining lines offset).
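
        Informal example of `indented` content that yields an attribution::

            This is the block quote content.

            -- Attribution line,
               continued with consistent indentation.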
1247 """
1248 blank = None
1249 nonblank_seen = False
1250 for i in range(len(indented)):
1251 line = indented[i].rstrip()
1252 if line:
1253 if nonblank_seen and blank == i - 1: # last line blank
1254 match = self.attribution_pattern.match(line)
1255 if match:
1256 attribution_end, indent = self.check_attribution(
1257 indented, i)
1258 if attribution_end:
1259 a_lines = indented[i:attribution_end]
1260 a_lines.trim_left(match.end(), end=1)
1261 a_lines.trim_left(indent, start=1)
1262 return (indented[:i], a_lines,
1263 i, indented[attribution_end:],
1264 line_offset + attribution_end)
1265 nonblank_seen = True
1266 else:
1267 blank = i
1268 else:
1269 return indented, None, None, None, None
1270
1271 def check_attribution(self, indented, attribution_start):
1272 """
1273 Check attribution shape.
1274 Return the index past the end of the attribution, and the indent.
1275 """
1276 indent = None
1277 i = attribution_start + 1
1278 for i in range(attribution_start + 1, len(indented)):
1279 line = indented[i].rstrip()
1280 if not line:
1281 break
1282 if indent is None:
1283 indent = len(line) - len(line.lstrip())
1284 elif len(line) - len(line.lstrip()) != indent:
1285 return None, None # bad shape; not an attribution
1286 else:
1287 # return index of line after last attribution line:
1288 i += 1
1289 return i, (indent or 0)
1290
1291 def parse_attribution(self, indented, line_offset):
1292 text = '\n'.join(indented).rstrip()
1293 lineno = 1 + line_offset # line_offset is zero-based
1294 textnodes, messages = self.inline_text(text, lineno)
1295 node = nodes.attribution(text, '', *textnodes)
1296 node.source, node.line = self.state_machine.get_source_and_line(lineno)
1297 return node, messages
1298
1299 def bullet(self, match, context, next_state):
1300 """Bullet list item."""
1301 ul = nodes.bullet_list()
1302 ul.source, ul.line = self.state_machine.get_source_and_line()
1303 self.parent += ul
1304 ul['bullet'] = match.string[0]
1305 i, blank_finish = self.list_item(match.end())
1306 ul += i
1307 offset = self.state_machine.line_offset + 1 # next line
1308 new_line_offset, blank_finish = self.nested_list_parse(
1309 self.state_machine.input_lines[offset:],
1310 input_offset=self.state_machine.abs_line_offset() + 1,
1311 node=ul, initial_state='BulletList',
1312 blank_finish=blank_finish)
1313 self.goto_line(new_line_offset)
1314 if not blank_finish:
1315 self.parent += self.unindent_warning('Bullet list')
1316 return [], next_state, []
1317
1318 def list_item(self, indent):
1319 src, srcline = self.state_machine.get_source_and_line()
1320 if self.state_machine.line[indent:]:
1321 indented, line_offset, blank_finish = (
1322 self.state_machine.get_known_indented(indent))
1323 else:
1324 indented, indent, line_offset, blank_finish = (
1325 self.state_machine.get_first_known_indented(indent))
1326 listitem = nodes.list_item('\n'.join(indented))
1327 listitem.source, listitem.line = src, srcline
1328 if indented:
1329 self.nested_parse(indented, input_offset=line_offset,
1330 node=listitem)
1331 return listitem, blank_finish
1332
1333 def enumerator(self, match, context, next_state):
1334 """Enumerated List Item"""
1335 format, sequence, text, ordinal = self.parse_enumerator(match)
1336 if not self.is_enumerated_list_item(ordinal, sequence, format):
1337 raise statemachine.TransitionCorrection('text')
1338 enumlist = nodes.enumerated_list()
1339 (enumlist.source,
1340 enumlist.line) = self.state_machine.get_source_and_line()
1341 self.parent += enumlist
1342 if sequence == '#':
1343 enumlist['enumtype'] = 'arabic'
1344 else:
1345 enumlist['enumtype'] = sequence
1346 enumlist['prefix'] = self.enum.formatinfo[format].prefix
1347 enumlist['suffix'] = self.enum.formatinfo[format].suffix
1348 if ordinal != 1:
1349 enumlist['start'] = ordinal
1350 msg = self.reporter.info(
1351 'Enumerated list start value not ordinal-1: "%s" (ordinal %s)'
1352 % (text, ordinal), base_node=enumlist)
1353 self.parent += msg
1354 listitem, blank_finish = self.list_item(match.end())
1355 enumlist += listitem
1356 offset = self.state_machine.line_offset + 1 # next line
1357 newline_offset, blank_finish = self.nested_list_parse(
1358 self.state_machine.input_lines[offset:],
1359 input_offset=self.state_machine.abs_line_offset() + 1,
1360 node=enumlist, initial_state='EnumeratedList',
1361 blank_finish=blank_finish,
1362 extra_settings={'lastordinal': ordinal,
1363 'format': format,
1364 'auto': sequence == '#'})
1365 self.goto_line(newline_offset)
1366 if not blank_finish:
1367 self.parent += self.unindent_warning('Enumerated list')
1368 return [], next_state, []
1369
1370 def parse_enumerator(self, match, expected_sequence=None):
1371 """
1372 Analyze an enumerator and return the results.
1373
1374 :Return:
1375 - the enumerator format ('period', 'parens', or 'rparen'),
1376 - the sequence used ('arabic', 'loweralpha', 'upperroman', etc.),
1377 - the text of the enumerator, stripped of formatting, and
1378 - the ordinal value of the enumerator ('a' -> 1, 'ii' -> 2, etc.;
1379 ``None`` is returned for invalid enumerator text).
1380
1381 The enumerator format has already been determined by the regular
1382 expression match. If `expected_sequence` is given, that sequence is
1383 tried first. If not, we check for Roman numeral 1. This way,
1384 single-character Roman numerals (which are also alphabetical) can be
1385 matched. If no sequence has been matched, all sequences are checked in
1386 order.
1387 """
1388 groupdict = match.groupdict()
1389 sequence = ''
1390 for format in self.enum.formats:
1391 if groupdict[format]: # was this the format matched?
1392 break # yes; keep `format`
1393 else: # shouldn't happen
1394 raise ParserError('enumerator format not matched')
1395 text = groupdict[format][self.enum.formatinfo[format].start # noqa: E203,E501
1396 : self.enum.formatinfo[format].end]
1397 if text == '#':
1398 sequence = '#'
1399 elif expected_sequence:
1400 try:
1401 if self.enum.sequenceregexps[expected_sequence].match(text):
1402 sequence = expected_sequence
1403 except KeyError: # shouldn't happen
1404 raise ParserError('unknown enumerator sequence: %s'
1405 % sequence)
1406 elif text == 'i':
1407 sequence = 'lowerroman'
1408 elif text == 'I':
1409 sequence = 'upperroman'
1410 if not sequence:
1411 for sequence in self.enum.sequences:
1412 if self.enum.sequenceregexps[sequence].match(text):
1413 break
1414 else: # shouldn't happen
1415 raise ParserError('enumerator sequence not matched')
1416 if sequence == '#':
1417 ordinal = 1
1418 else:
1419 try:
1420 ordinal = int(self.enum.converters[sequence](text))
1421 except InvalidRomanNumeralError:
1422 ordinal = None
1423 return format, sequence, text, ordinal
1424
1425 def is_enumerated_list_item(self, ordinal, sequence, format):
1426 """
1427 Check validity based on the ordinal value and the second line.
1428
1429 Return true if the ordinal is valid and the second line is blank,
1430 indented, or starts with the next enumerator or an auto-enumerator.
1431 """
1432 if ordinal is None:
1433 return None
1434 try:
1435 next_line = self.state_machine.next_line()
1436 except EOFError: # end of input lines
1437 self.state_machine.previous_line()
1438 return 1
1439 else:
1440 self.state_machine.previous_line()
1441 if not next_line[:1].strip(): # blank or indented
1442 return 1
1443 result = self.make_enumerator(ordinal + 1, sequence, format)
1444 if result:
1445 next_enumerator, auto_enumerator = result
1446 try:
1447 if next_line.startswith((next_enumerator, auto_enumerator)):
1448 return 1
1449 except TypeError:
1450 pass
1451 return None
1452
1453 def make_enumerator(self, ordinal, sequence, format):
1454 """
1455 Construct and return the next enumerated list item marker, and an
1456 auto-enumerator ("#" instead of the regular enumerator).
1457
1458 Return ``None`` for invalid (out of range) ordinals.
1459 """
1460 if sequence == '#':
1461 enumerator = '#'
1462 elif sequence == 'arabic':
1463 enumerator = str(ordinal)
1464 else:
1465 if sequence.endswith('alpha'):
1466 if ordinal > 26:
1467 return None
1468 enumerator = chr(ordinal + ord('a') - 1)
1469 elif sequence.endswith('roman'):
1470 try:
1471 enumerator = RomanNumeral(ordinal).to_uppercase()
1472 except TypeError:
1473 return None
1474 else: # shouldn't happen
1475 raise ParserError('unknown enumerator sequence: "%s"'
1476 % sequence)
1477 if sequence.startswith('lower'):
1478 enumerator = enumerator.lower()
1479 elif sequence.startswith('upper'):
1480 enumerator = enumerator.upper()
1481 else: # shouldn't happen
1482 raise ParserError('unknown enumerator sequence: "%s"'
1483 % sequence)
1484 formatinfo = self.enum.formatinfo[format]
1485 next_enumerator = (formatinfo.prefix + enumerator + formatinfo.suffix
1486 + ' ')
1487 auto_enumerator = formatinfo.prefix + '#' + formatinfo.suffix + ' '
1488 return next_enumerator, auto_enumerator
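
    # Informal examples (not executed): make_enumerator(3, 'loweralpha',
    # 'parens') returns ('(c) ', '(#) '); make_enumerator(4, 'upperroman',
    # 'period') returns ('IV. ', '#. ').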
1489
1490 def field_marker(self, match, context, next_state):
1491 """Field list item."""
1492 field_list = nodes.field_list()
1493 self.parent += field_list
1494 field, blank_finish = self.field(match)
1495 field_list += field
1496 offset = self.state_machine.line_offset + 1 # next line
1497 newline_offset, blank_finish = self.nested_list_parse(
1498 self.state_machine.input_lines[offset:],
1499 input_offset=self.state_machine.abs_line_offset() + 1,
1500 node=field_list, initial_state='FieldList',
1501 blank_finish=blank_finish)
1502 self.goto_line(newline_offset)
1503 if not blank_finish:
1504 self.parent += self.unindent_warning('Field list')
1505 return [], next_state, []
1506
1507 def field(self, match):
1508 name = self.parse_field_marker(match)
1509 src, srcline = self.state_machine.get_source_and_line()
1510 lineno = self.state_machine.abs_line_number()
1511 (indented, indent, line_offset, blank_finish
1512 ) = self.state_machine.get_first_known_indented(match.end())
1513 field_node = nodes.field()
1514 field_node.source = src
1515 field_node.line = srcline
1516 name_nodes, name_messages = self.inline_text(name, lineno)
1517 field_node += nodes.field_name(name, '', *name_nodes)
1518 field_body = nodes.field_body('\n'.join(indented), *name_messages)
1519 field_node += field_body
1520 if indented:
1521 self.parse_field_body(indented, line_offset, field_body)
1522 return field_node, blank_finish
1523
1524 def parse_field_marker(self, match):
1525 """Extract & return field name from a field marker match."""
1526 field = match.group()[1:] # strip off leading ':'
1527 field = field[:field.rfind(':')] # strip off trailing ':' etc.
1528 return field
1529
1530 def parse_field_body(self, indented, offset, node) -> None:
1531 self.nested_parse(indented, input_offset=offset, node=node)
1532
1533 def option_marker(self, match, context, next_state):
1534 """Option list item."""
1535 optionlist = nodes.option_list()
1536 (optionlist.source, optionlist.line
1537 ) = self.state_machine.get_source_and_line()
1538 try:
1539 listitem, blank_finish = self.option_list_item(match)
1540 except MarkupError as error:
1541 # This shouldn't happen; pattern won't match.
1542 msg = self.reporter.error('Invalid option list marker: %s'
1543 % error)
1544 self.parent += msg
1545 (indented, indent, line_offset, blank_finish
1546 ) = self.state_machine.get_first_known_indented(match.end())
1547 elements = self.block_quote(indented, line_offset)
1548 self.parent += elements
1549 if not blank_finish:
1550 self.parent += self.unindent_warning('Option list')
1551 return [], next_state, []
1552 self.parent += optionlist
1553 optionlist += listitem
1554 offset = self.state_machine.line_offset + 1 # next line
1555 newline_offset, blank_finish = self.nested_list_parse(
1556 self.state_machine.input_lines[offset:],
1557 input_offset=self.state_machine.abs_line_offset() + 1,
1558 node=optionlist, initial_state='OptionList',
1559 blank_finish=blank_finish)
1560 self.goto_line(newline_offset)
1561 if not blank_finish:
1562 self.parent += self.unindent_warning('Option list')
1563 return [], next_state, []
1564
1565 def option_list_item(self, match):
1566 offset = self.state_machine.abs_line_offset()
1567 options = self.parse_option_marker(match)
1568 (indented, indent, line_offset, blank_finish
1569 ) = self.state_machine.get_first_known_indented(match.end())
1570 if not indented: # not an option list item
1571 self.goto_line(offset)
1572 raise statemachine.TransitionCorrection('text')
1573 option_group = nodes.option_group('', *options)
1574 description = nodes.description('\n'.join(indented))
1575 option_list_item = nodes.option_list_item('', option_group,
1576 description)
1577 if indented:
1578 self.nested_parse(indented, input_offset=line_offset,
1579 node=description)
1580 return option_list_item, blank_finish
1581
1582 def parse_option_marker(self, match):
1583 """
1584 Return a list of `nodes.option` and `nodes.option_argument` objects,
1585 parsed from an option marker match.
1586
1587 :Exception: `MarkupError` for invalid option markers.
1588 """
1589 optlist = []
1590 # split at ", ", except inside < > (complex arguments)
1591 optionstrings = re.split(r', (?![^<]*>)', match.group().rstrip())
1592 for optionstring in optionstrings:
1593 tokens = optionstring.split()
1594 delimiter = ' '
1595 firstopt = tokens[0].split('=', 1)
1596 if len(firstopt) > 1:
1597 # "--opt=value" form
1598 tokens[:1] = firstopt
1599 delimiter = '='
1600 elif (len(tokens[0]) > 2
1601 and ((tokens[0].startswith('-')
1602 and not tokens[0].startswith('--'))
1603 or tokens[0].startswith('+'))):
1604 # "-ovalue" form
1605 tokens[:1] = [tokens[0][:2], tokens[0][2:]]
1606 delimiter = ''
1607 if len(tokens) > 1 and (tokens[1].startswith('<')
1608 and tokens[-1].endswith('>')):
1609 # "-o <value1 value2>" form; join all values into one token
1610 tokens[1:] = [' '.join(tokens[1:])]
1611 if 0 < len(tokens) <= 2:
1612 option = nodes.option(optionstring)
1613 option += nodes.option_string(tokens[0], tokens[0])
1614 if len(tokens) > 1:
1615 option += nodes.option_argument(tokens[1], tokens[1],
1616 delimiter=delimiter)
1617 optlist.append(option)
1618 else:
1619 raise MarkupError(
1620 'wrong number of option tokens (=%s), should be 1 or 2: '
1621 '"%s"' % (len(tokens), optionstring))
1622 return optlist
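# Illustrative examples of how the loop above tokenizes option markers:
#
#     "-a"              -> option_string "-a"
#     "--long=value"    -> option_string "--long", option_argument "value",
#                          delimiter "="
#     "-ovalue"         -> option_string "-o", option_argument "value",
#                          delimiter ""
#     "-o <file name>"  -> option_string "-o", option_argument "<file name>"
#                          (values in angle brackets are joined into one token)
#     "-a, -b FILE"     -> two option elements, split at ", "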
1623
1624 def doctest(self, match, context, next_state):
1625 line = self.document.current_line
1626 data = '\n'.join(self.state_machine.get_text_block())
1627 # TODO: Parse with `directives.body.CodeBlock` with
1628 # argument 'pycon' (Python Console) in Docutils 1.0.
1629 n = nodes.doctest_block(data, data)
1630 n.line = line
1631 self.parent += n
1632 return [], next_state, []
1633
1634 def line_block(self, match, context, next_state):
1635 """First line of a line block."""
1636 block = nodes.line_block()
1637 self.parent += block
1638 lineno = self.state_machine.abs_line_number()
1639 (block.source,
1640 block.line) = self.state_machine.get_source_and_line(lineno)
1641 line, messages, blank_finish = self.line_block_line(match, lineno)
1642 block += line
1643 self.parent += messages
1644 if not blank_finish:
1645 offset = self.state_machine.line_offset + 1 # next line
1646 new_line_offset, blank_finish = self.nested_list_parse(
1647 self.state_machine.input_lines[offset:],
1648 input_offset=self.state_machine.abs_line_offset() + 1,
1649 node=block, initial_state='LineBlock',
1650 blank_finish=False)
1651 self.goto_line(new_line_offset)
1652 if not blank_finish:
1653 self.parent += self.reporter.warning(
1654 'Line block ends without a blank line.',
1655 line=lineno+1)
1656 if len(block):
1657 if block[0].indent is None:
1658 block[0].indent = 0
1659 self.nest_line_block_lines(block)
1660 return [], next_state, []
1661
1662 def line_block_line(self, match, lineno):
1663 """Return one `line` element of a line_block."""
1664 (indented, indent, line_offset, blank_finish
1665 ) = self.state_machine.get_first_known_indented(match.end(),
1666 until_blank=True)
1667 text = '\n'.join(indented)
1668 text_nodes, messages = self.inline_text(text, lineno)
1669 line = nodes.line(text, '', *text_nodes)
1670 (line.source,
1671 line.line) = self.state_machine.get_source_and_line(lineno)
1672 if match.string.rstrip() != '|': # not empty
1673 line.indent = len(match.group(1)) - 1
1674 return line, messages, blank_finish
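# Example, assuming the 'line_block' pattern captures the whitespace after
# the "|" as group 1: "|   A quoted line" yields ``line.indent == 2`` (three
# captured spaces minus one), while a bare "|" leaves ``line.indent`` unset
# (None) so `nest_line_block_lines` can inherit the previous line's indent.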
1675
1676 def nest_line_block_lines(self, block) -> None:
1677 for index in range(1, len(block)):
1678 if block[index].indent is None:
1679 block[index].indent = block[index - 1].indent
1680 self.nest_line_block_segment(block)
1681
1682 def nest_line_block_segment(self, block) -> None:
1683 indents = [item.indent for item in block]
1684 least = min(indents)
1685 new_items = []
1686 new_block = nodes.line_block()
1687 for item in block:
1688 if item.indent > least:
1689 new_block.append(item)
1690 else:
1691 if len(new_block):
1692 self.nest_line_block_segment(new_block)
1693 new_items.append(new_block)
1694 new_block = nodes.line_block()
1695 new_items.append(item)
1696 if len(new_block):
1697 self.nest_line_block_segment(new_block)
1698 new_items.append(new_block)
1699 block[:] = new_items
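# Worked example of the nesting above: sibling ``line`` elements with indents
# [0, 2, 2, 0] become
#
#     line_block
#         line (indent 0)
#         line_block
#             line (indent 2)
#             line (indent 2)
#         line (indent 0)
#
# with the recursion repeating inside each new child block for deeper indents.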
1700
1701 def grid_table_top(self, match, context, next_state):
1702 """Top border of a full table."""
1703 return self.table_top(match, context, next_state,
1704 self.isolate_grid_table,
1705 tableparser.GridTableParser)
1706
1707 def simple_table_top(self, match, context, next_state):
1708 """Top border of a simple table."""
1709 return self.table_top(match, context, next_state,
1710 self.isolate_simple_table,
1711 tableparser.SimpleTableParser)
1712
1713 def table_top(self, match, context, next_state,
1714 isolate_function, parser_class):
1715 """Top border of a generic table."""
1716 nodelist, blank_finish = self.table(isolate_function, parser_class)
1717 self.parent += nodelist
1718 if not blank_finish:
1719 msg = self.reporter.warning(
1720 'Blank line required after table.',
1721 line=self.state_machine.abs_line_number()+1)
1722 self.parent += msg
1723 return [], next_state, []
1724
1725 def table(self, isolate_function, parser_class):
1726 """Parse a table."""
1727 block, messages, blank_finish = isolate_function()
1728 if block:
1729 try:
1730 parser = parser_class()
1731 tabledata = parser.parse(block)
1732 tableline = (self.state_machine.abs_line_number() - len(block)
1733 + 1)
1734 table = self.build_table(tabledata, tableline)
1735 nodelist = [table] + messages
1736 except tableparser.TableMarkupError as err:
1737 nodelist = self.malformed_table(block, ' '.join(err.args),
1738 offset=err.offset) + messages
1739 else:
1740 nodelist = messages
1741 return nodelist, blank_finish
1742
1743 def isolate_grid_table(self):
1744 messages = []
1745 blank_finish = True
1746 try:
1747 block = self.state_machine.get_text_block(flush_left=True)
1748 except statemachine.UnexpectedIndentationError as err:
1749 block, src, srcline = err.args
1750 messages.append(self.reporter.error('Unexpected indentation.',
1751 source=src, line=srcline))
1752 blank_finish = False
1753 block.disconnect()
1754 # for East Asian chars:
1755 block.pad_double_width(self.double_width_pad_char)
1756 width = len(block[0].strip())
1757 for i in range(len(block)):
1758 block[i] = block[i].strip()
1759 if block[i][0] not in '+|': # check left edge
1760 blank_finish = False
1761 self.state_machine.previous_line(len(block) - i)
1762 del block[i:]
1763 break
1764 if not self.grid_table_top_pat.match(block[-1]): # find bottom
1765 # from second-last to third line of table:
1766 for i in range(len(block) - 2, 1, -1):
1767 if self.grid_table_top_pat.match(block[i]):
1768 self.state_machine.previous_line(len(block) - i + 1)
1769 del block[i+1:]
1770 blank_finish = False
1771 break
1772 else:
1773 detail = 'Bottom border missing or corrupt.'
1774 messages.extend(self.malformed_table(block, detail, i))
1775 return [], messages, blank_finish
1776 for i in range(len(block)): # check right edge
1777 if len(block[i]) != width or block[i][-1] not in '+|':
1778 detail = 'Right border not aligned or missing.'
1779 messages.extend(self.malformed_table(block, detail, i))
1780 return [], messages, blank_finish
1781 return block, messages, blank_finish
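# Illustrative note: a well-formed grid table block looks like
#
#     +------------+------------+
#     | Cell 1     | Cell 2     |
#     +------------+------------+
#
# The checks above require every line to start with "+" or "|" (left edge),
# to match the top border's width with "+" or "|" at the right edge, and the
# last line to match the top-border pattern; otherwise the block is truncated
# or reported via `malformed_table`.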
1782
1783 def isolate_simple_table(self):
1784 start = self.state_machine.line_offset
1785 lines = self.state_machine.input_lines
1786 limit = len(lines) - 1
1787 toplen = len(lines[start].strip())
1788 pattern_match = self.simple_table_border_pat.match
1789 found = 0
1790 found_at = None
1791 i = start + 1
1792 while i <= limit:
1793 line = lines[i]
1794 match = pattern_match(line)
1795 if match:
1796 if len(line.strip()) != toplen:
1797 self.state_machine.next_line(i - start)
1798 messages = self.malformed_table(
1799 lines[start:i+1], 'Bottom border or header rule does '
1800 'not match top border.', i-start)
1801 return [], messages, i == limit or not lines[i+1].strip()
1802 found += 1
1803 found_at = i
1804 if found == 2 or i == limit or not lines[i+1].strip():
1805 end = i
1806 break
1807 i += 1
1808 else: # reached end of input_lines
1809 details = 'No bottom table border found'
1810 if found:
1811 details += ' or no blank line after table bottom'
1812 self.state_machine.next_line(found_at - start)
1813 block = lines[start:found_at+1]
1814 else:
1815 self.state_machine.next_line(i - start - 1)
1816 block = lines[start:]
1817 messages = self.malformed_table(block, details + '.')
1818 return [], messages, not found
1819 self.state_machine.next_line(end - start)
1820 block = lines[start:end+1]
1821 # for East Asian chars:
1822 block.pad_double_width(self.double_width_pad_char)
1823 return block, [], end == limit or not lines[end+1].strip()
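# Illustrative note: a simple table is delimited by border lines of "="
# columns that must all have the top border's width, e.g.
#
#     =====  =====
#     A      B
#     =====  =====      (header/body separator)
#     a      b
#     =====  =====      (bottom border, followed by a blank line)
#
# The loop above stops after the second border below the top, at a border
# followed by a blank line, or at the end of input; a missing bottom border
# is reported via `malformed_table`.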
1824
1825 def malformed_table(self, block, detail='', offset=0):
1826 block.replace(self.double_width_pad_char, '')
1827 data = '\n'.join(block)
1828 message = 'Malformed table.'
1829 startline = self.state_machine.abs_line_number() - len(block) + 1
1830 if detail:
1831 message += '\n' + detail
1832 error = self.reporter.error(message, nodes.literal_block(data, data),
1833 line=startline+offset)
1834 return [error]
1835
1836 def build_table(self, tabledata, tableline, stub_columns=0, widths=None):
1837 colwidths, headrows, bodyrows = tabledata
1838 table = nodes.table()
1839 if widths == 'auto':
1840 table['classes'] += ['colwidths-auto']
1841 elif widths: # "grid" or list of integers
1842 table['classes'] += ['colwidths-given']
1843 tgroup = nodes.tgroup(cols=len(colwidths))
1844 table += tgroup
1845 for colwidth in colwidths:
1846 colspec = nodes.colspec(colwidth=colwidth)
1847 if stub_columns:
1848 colspec.attributes['stub'] = True
1849 stub_columns -= 1
1850 tgroup += colspec
1851 if headrows:
1852 thead = nodes.thead()
1853 tgroup += thead
1854 for row in headrows:
1855 thead += self.build_table_row(row, tableline)
1856 tbody = nodes.tbody()
1857 tgroup += tbody
1858 for row in bodyrows:
1859 tbody += self.build_table_row(row, tableline)
1860 return table
1861
1862 def build_table_row(self, rowdata, tableline):
1863 row = nodes.row()
1864 for cell in rowdata:
1865 if cell is None:
1866 continue
1867 morerows, morecols, offset, cellblock = cell
1868 attributes = {}
1869 if morerows:
1870 attributes['morerows'] = morerows
1871 if morecols:
1872 attributes['morecols'] = morecols
1873 entry = nodes.entry(**attributes)
1874 row += entry
1875 if ''.join(cellblock):
1876 self.nested_parse(cellblock, input_offset=tableline+offset,
1877 node=entry)
1878 return row
1879
1880 explicit = Struct()
1881 """Patterns and constants used for explicit markup recognition."""
1882
1883 explicit.patterns = Struct(
1884 target=re.compile(r"""
1885 (
1886 _ # anonymous target
1887 | # *OR*
1888 (?!_) # no underscore at the beginning
1889 (?P<quote>`?) # optional open quote
1890 (?![ `]) # first char. not space or
1891 # backquote
1892 (?P<name> # reference name
1893 .+?
1894 )
1895 %(non_whitespace_escape_before)s
1896 (?P=quote) # close quote if open quote used
1897 )
1898 (?<!(?<!\x00):) # no unescaped colon at end
1899 %(non_whitespace_escape_before)s
1900 [ ]? # optional space
1901 : # end of reference name
1902 ([ ]+|$) # followed by whitespace
1903 """ % vars(Inliner), re.VERBOSE),
1904 reference=re.compile(r"""
1905 (
1906 (?P<simple>%(simplename)s)_
1907 | # *OR*
1908 ` # open backquote
1909 (?![ ]) # not space
1910 (?P<phrase>.+?) # hyperlink phrase
1911 %(non_whitespace_escape_before)s
1912 `_ # close backquote,
1913 # reference mark
1914 )
1915 $ # end of string
1916 """ % vars(Inliner), re.VERBOSE),
1917 substitution=re.compile(r"""
1918 (
1919 (?![ ]) # first char. not space
1920 (?P<name>.+?) # substitution text
1921 %(non_whitespace_escape_before)s
1922 \| # close delimiter
1923 )
1924 ([ ]+|$) # followed by whitespace
1925 """ % vars(Inliner),
1926 re.VERBOSE),)
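# Examples of strings matched by the patterns above (standard reStructuredText
# explicit-markup syntax):
#
#     target:        "name: URI", "`a phrase`: URI", "_: URI" (anonymous)
#     reference:     "name_", "`hyperlink phrase`_"
#     substitution:  "text|" followed by whitespace -- the part of a
#                    substitution definition up to its closing "|"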
1927
1928 def footnote(self, match):
1929 src, srcline = self.state_machine.get_source_and_line()
1930 (indented, indent, offset, blank_finish
1931 ) = self.state_machine.get_first_known_indented(match.end())
1932 label = match.group(1)
1933 name = normalize_name(label)
1934 footnote = nodes.footnote('\n'.join(indented))
1935 footnote.source = src
1936 footnote.line = srcline
1937 if name[0] == '#': # auto-numbered
1938 name = name[1:] # autonumber label
1939 footnote['auto'] = 1
1940 if name:
1941 footnote['names'].append(name)
1942 self.document.note_autofootnote(footnote)
1943 elif name == '*': # auto-symbol
1944 name = ''
1945 footnote['auto'] = '*'
1946 self.document.note_symbol_footnote(footnote)
1947 else: # manually numbered
1948 footnote += nodes.label('', label)
1949 footnote['names'].append(name)
1950 self.document.note_footnote(footnote)
1951 if name:
1952 self.document.note_explicit_target(footnote, footnote)
1953 else:
1954 self.document.set_id(footnote, footnote)
1955 if indented:
1956 self.nested_parse(indented, input_offset=offset, node=footnote)
1957 else:
1958 footnote += self.reporter.warning('Footnote content expected.')
1959 return [footnote], blank_finish
1960
1961 def citation(self, match):
1962 src, srcline = self.state_machine.get_source_and_line()
1963 (indented, indent, offset, blank_finish
1964 ) = self.state_machine.get_first_known_indented(match.end())
1965 label = match.group(1)
1966 name = normalize_name(label)
1967 citation = nodes.citation('\n'.join(indented))
1968 citation.source = src
1969 citation.line = srcline
1970 citation += nodes.label('', label)
1971 citation['names'].append(name)
1972 self.document.note_citation(citation)
1973 self.document.note_explicit_target(citation, citation)
1974 if indented:
1975 self.nested_parse(indented, input_offset=offset, node=citation)
1976 else:
1977 citation += self.reporter.warning('Citation content expected.')
1978 return [citation], blank_finish
1979
1980 def hyperlink_target(self, match):
1981 pattern = self.explicit.patterns.target
1982 lineno = self.state_machine.abs_line_number()
1983 (block, indent, offset, blank_finish
1984 ) = self.state_machine.get_first_known_indented(
1985 match.end(), until_blank=True, strip_indent=False)
1986 blocktext = match.string[:match.end()] + '\n'.join(block)
1987 block = [escape2null(line) for line in block]
1988 escaped = block[0]
1989 blockindex = 0
1990 while True:
1991 targetmatch = pattern.match(escaped)
1992 if targetmatch:
1993 break
1994 blockindex += 1
1995 try:
1996 escaped += block[blockindex]
1997 except IndexError:
1998 raise MarkupError('malformed hyperlink target.')
1999 del block[:blockindex]
2000 block[0] = (block[0] + ' ')[targetmatch.end()-len(escaped)-1:].strip()
2001 target = self.make_target(block, blocktext, lineno,
2002 targetmatch.group('name'))
2003 return [target], blank_finish
2004
2005 def make_target(self, block, block_text, lineno, target_name):
2006 target_type, data = self.parse_target(block, block_text, lineno)
2007 if target_type == 'refname':
2008 target = nodes.target(block_text, '', refname=normalize_name(data))
2009 target.indirect_reference_name = data
2010 self.add_target(target_name, '', target, lineno)
2011 self.document.note_indirect_target(target)
2012 return target
2013 elif target_type == 'refuri':
2014 target = nodes.target(block_text, '')
2015 self.add_target(target_name, data, target, lineno)
2016 return target
2017 else:
2018 return data
2019
2020 def parse_target(self, block, block_text, lineno):
2021 """
2022 Determine the type of reference of a target.
2023
2024 :Return: A 2-tuple, one of:
2025
2026 - 'refname' and the indirect reference name
2027 - 'refuri' and the URI
2028 - 'malformed' and a system_message node
2029 """
2030 if block and block[-1].strip()[-1:] == '_': # possible indirect target
2031 reference = ' '.join(line.strip() for line in block)
2032 refname = self.is_reference(reference)
2033 if refname:
2034 return 'refname', refname
2035 ref_parts = split_escaped_whitespace(' '.join(block))
2036 reference = ' '.join(''.join(unescape(part).split())
2037 for part in ref_parts)
2038 return 'refuri', reference
2039
2040 def is_reference(self, reference):
2041 match = self.explicit.patterns.reference.match(
2042 whitespace_normalize_name(reference))
2043 if not match:
2044 return None
2045 return unescape(match.group('simple') or match.group('phrase'))
2046
2047 def add_target(self, targetname, refuri, target, lineno):
2048 target.line = lineno
2049 if targetname:
2050 name = normalize_name(unescape(targetname))
2051 target['names'].append(name)
2052 if refuri:
2053 uri = self.inliner.adjust_uri(refuri)
2054 if uri:
2055 target['refuri'] = uri
2056 else:
2057 raise ApplicationError('problem with URI: %r' % refuri)
2058 self.document.note_explicit_target(target, self.parent)
2059 else: # anonymous target
2060 if refuri:
2061 target['refuri'] = refuri
2062 target['anonymous'] = True
2063 self.document.note_anonymous_target(target)
2064
2065 def substitution_def(self, match):
2066 pattern = self.explicit.patterns.substitution
2067 src, srcline = self.state_machine.get_source_and_line()
2068 (block, indent, offset, blank_finish
2069 ) = self.state_machine.get_first_known_indented(match.end(),
2070 strip_indent=False)
2071 blocktext = (match.string[:match.end()] + '\n'.join(block))
2072 block.disconnect()
2073 escaped = escape2null(block[0].rstrip())
2074 blockindex = 0
2075 while True:
2076 subdefmatch = pattern.match(escaped)
2077 if subdefmatch:
2078 break
2079 blockindex += 1
2080 try:
2081 escaped = escaped + ' ' + escape2null(
2082 block[blockindex].strip())
2083 except IndexError:
2084 raise MarkupError('malformed substitution definition.')
2085 del block[:blockindex] # strip out the substitution marker
2086 start = subdefmatch.end()-len(escaped)-1
2087 block[0] = (block[0].strip() + ' ')[start:-1]
2088 if not block[0]:
2089 del block[0]
2090 offset += 1
2091 while block and not block[-1].strip():
2092 block.pop()
2093 subname = subdefmatch.group('name')
2094 substitution_node = nodes.substitution_definition(blocktext)
2095 substitution_node.source = src
2096 substitution_node.line = srcline
2097 if not block:
2098 msg = self.reporter.warning(
2099 'Substitution definition "%s" missing contents.' % subname,
2100 nodes.literal_block(blocktext, blocktext),
2101 source=src, line=srcline)
2102 return [msg], blank_finish
2103 block[0] = block[0].strip()
2104 substitution_node['names'].append(
2105 nodes.whitespace_normalize_name(subname))
2106 new_abs_offset, blank_finish = self.nested_list_parse(
2107 block, input_offset=offset, node=substitution_node,
2108 initial_state='SubstitutionDef', blank_finish=blank_finish)
2109 i = 0
2110 for node in substitution_node[:]:
2111 if not (isinstance(node, nodes.Inline)
2112 or isinstance(node, nodes.Text)):
2113 self.parent += substitution_node[i]
2114 del substitution_node[i]
2115 else:
2116 i += 1
2117 for node in substitution_node.findall(nodes.Element):
2118 if self.disallowed_inside_substitution_definitions(node):
2119 pformat = nodes.literal_block('', node.pformat().rstrip())
2120 msg = self.reporter.error(
2121 'Substitution definition contains illegal element <%s>:'
2122 % node.tagname,
2123 pformat, nodes.literal_block(blocktext, blocktext),
2124 source=src, line=srcline)
2125 return [msg], blank_finish
2126 if len(substitution_node) == 0:
2127 msg = self.reporter.warning(
2128 'Substitution definition "%s" empty or invalid.' % subname,
2129 nodes.literal_block(blocktext, blocktext),
2130 source=src, line=srcline)
2131 return [msg], blank_finish
2132 self.document.note_substitution_def(
2133 substitution_node, subname, self.parent)
2134 return [substitution_node], blank_finish
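# Illustrative example: the substitution definition
#
#     .. |EXAMPLE| image:: example.png
#
# is handled here by matching the "|EXAMPLE|" marker, registering the name
# "EXAMPLE", and nested-parsing the remainder in the 'SubstitutionDef' state,
# so the embedded ``image`` directive supplies the substitution's
# (inline-only) content.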
2135
2136 def disallowed_inside_substitution_definitions(self, node) -> bool:
2137 if (node['ids']
2138 or isinstance(node, nodes.reference) and node.get('anonymous')
2139 or isinstance(node, nodes.footnote_reference) and node.get('auto')): # noqa: E501
2140 return True
2141 else:
2142 return False
2143
2144 def directive(self, match, **option_presets):
2145 """Returns a 2-tuple: list of nodes, and a "blank finish" boolean."""
2146 type_name = match.group(1)
2147 directive_class, messages = directives.directive(
2148 type_name, self.memo.language, self.document)
2149 self.parent += messages
2150 if directive_class:
2151 return self.run_directive(
2152 directive_class, match, type_name, option_presets)
2153 else:
2154 return self.unknown_directive(type_name)
2155
2156 def run_directive(self, directive, match, type_name, option_presets):
2157 """
2158 Parse a directive then run its directive function.
2159
2160 Parameters:
2161
2162 - `directive`: The class implementing the directive. Must be
2163 a subclass of `rst.Directive`.
2164
2165 - `match`: A regular expression match object which matched the first
2166 line of the directive.
2167
2168 - `type_name`: The directive name, as used in the source text.
2169
2170 - `option_presets`: A dictionary of preset options, defaults for the
2171 directive options. Currently, only an "alt" option is passed by
2172 substitution definitions (value: the substitution name), which may
2173 be used by an embedded image directive.
2174
2175 Returns a 2-tuple: list of nodes, and a "blank finish" boolean.
2176 """
2177 if isinstance(directive, (FunctionType, MethodType)):
2178 from docutils.parsers.rst import convert_directive_function
2179 directive = convert_directive_function(directive)
2180 lineno = self.state_machine.abs_line_number()
2181 initial_line_offset = self.state_machine.line_offset
2182 (indented, indent, line_offset, blank_finish
2183 ) = self.state_machine.get_first_known_indented(match.end(),
2184 strip_top=0)
2185 block_text = '\n'.join(self.state_machine.input_lines[
2186 initial_line_offset : self.state_machine.line_offset + 1]) # noqa: E203,E501
2187 try:
2188 arguments, options, content, content_offset = (
2189 self.parse_directive_block(indented, line_offset,
2190 directive, option_presets))
2191 except MarkupError as detail:
2192 error = self.reporter.error(
2193 'Error in "%s" directive:\n%s.' % (type_name,
2194 ' '.join(detail.args)),
2195 nodes.literal_block(block_text, block_text), line=lineno)
2196 return [error], blank_finish
2197 directive_instance = directive(
2198 type_name, arguments, options, content, lineno,
2199 content_offset, block_text, self, self.state_machine)
2200 try:
2201 result = directive_instance.run()
2202 except docutils.parsers.rst.DirectiveError as error:
2203 msg_node = self.reporter.system_message(error.level, error.msg,
2204 line=lineno)
2205 msg_node += nodes.literal_block(block_text, block_text)
2206 result = [msg_node]
2207 assert isinstance(result, list), \
2208 'Directive "%s" must return a list of nodes.' % type_name
2209 for i in range(len(result)):
2210 assert isinstance(result[i], nodes.Node), \
2211 ('Directive "%s" returned non-Node object (index %s): %r'
2212 % (type_name, i, result[i]))
2213 return (result,
2214 blank_finish or self.state_machine.is_next_line_blank())
2215
2216 def parse_directive_block(self, indented, line_offset, directive,
2217 option_presets):
2218 option_spec = directive.option_spec
2219 has_content = directive.has_content
2220 if indented and not indented[0].strip():
2221 indented.trim_start()
2222 line_offset += 1
2223 while indented and not indented[-1].strip():
2224 indented.trim_end()
2225 if indented and (directive.required_arguments
2226 or directive.optional_arguments
2227 or option_spec):
2228 for i, line in enumerate(indented):
2229 if not line.strip():
2230 break
2231 else:
2232 i += 1
2233 arg_block = indented[:i]
2234 content = indented[i+1:]
2235 content_offset = line_offset + i + 1
2236 else:
2237 content = indented
2238 content_offset = line_offset
2239 arg_block = []
2240 if option_spec:
2241 options, arg_block = self.parse_directive_options(
2242 option_presets, option_spec, arg_block)
2243 else:
2244 options = {}
2245 if arg_block and not (directive.required_arguments
2246 or directive.optional_arguments):
2247 content = arg_block + indented[i:]
2248 content_offset = line_offset
2249 arg_block = []
2250 while content and not content[0].strip():
2251 content.trim_start()
2252 content_offset += 1
2253 if directive.required_arguments or directive.optional_arguments:
2254 arguments = self.parse_directive_arguments(
2255 directive, arg_block)
2256 else:
2257 arguments = []
2258 if content and not has_content:
2259 raise MarkupError('no content permitted')
2260 return arguments, options, content, content_offset
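# Worked example of the split performed above.  For a directive block such as
#
#     .. figure:: picture.png
#        :width: 200px
#
#        The caption paragraph.
#
# the indented block is divided at its first blank line: "picture.png" and
# ":width: 200px" form the argument/option block, while "The caption
# paragraph." becomes the content, with `content_offset` pointing at its
# first line.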
2261
2262 def parse_directive_options(self, option_presets, option_spec, arg_block):
2263 options = option_presets.copy()
2264 for i, line in enumerate(arg_block):
2265 if re.match(Body.patterns['field_marker'], line):
2266 opt_block = arg_block[i:]
2267 arg_block = arg_block[:i]
2268 break
2269 else:
2270 opt_block = []
2271 if opt_block:
2272 success, data = self.parse_extension_options(option_spec,
2273 opt_block)
2274 if success: # data is a dict of options
2275 options.update(data)
2276 else: # data is an error string
2277 raise MarkupError(data)
2278 return options, arg_block
2279
2280 def parse_directive_arguments(self, directive, arg_block):
2281 required = directive.required_arguments
2282 optional = directive.optional_arguments
2283 arg_text = '\n'.join(arg_block)
2284 arguments = arg_text.split()
2285 if len(arguments) < required:
2286 raise MarkupError('%s argument(s) required, %s supplied'
2287 % (required, len(arguments)))
2288 elif len(arguments) > required + optional:
2289 if directive.final_argument_whitespace:
2290 arguments = arg_text.split(None, required + optional - 1)
2291 else:
2292 raise MarkupError(
2293 'maximum %s argument(s) allowed, %s supplied'
2294 % (required + optional, len(arguments)))
2295 return arguments
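# Example: with required_arguments=1, optional_arguments=0 and
# final_argument_whitespace=True (typical for title-like arguments), an
# argument block of
#
#     A title that
#     spans two lines
#
# initially splits into several whitespace-separated tokens, exceeds the one
# allowed argument, and is therefore re-split with ``arg_text.split(None, 0)``
# so that the whole text becomes the single argument.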
2296
2297 def parse_extension_options(self, option_spec, datalines):
2298 """
2299 Parse `datalines` for a field list containing extension options
2300 matching `option_spec`.
2301
2302 :Parameters:
2303 - `option_spec`: a mapping of option name to conversion
2304 function, which should raise an exception on bad input.
2305 - `datalines`: a list of input strings.
2306
2307 :Return:
2308 - Success value, 1 or 0.
2309 - An option dictionary on success, an error string on failure.
2310 """
2311 node = nodes.field_list()
2312 newline_offset, blank_finish = self.nested_list_parse(
2313 datalines, 0, node, initial_state='ExtensionOptions',
2314 blank_finish=True)
2315 if newline_offset != len(datalines): # incomplete parse of block
2316 return 0, 'invalid option block'
2317 try:
2318 options = utils.extract_extension_options(node, option_spec)
2319 except KeyError as detail:
2320 return 0, 'unknown option: "%s"' % detail.args[0]
2321 except (ValueError, TypeError) as detail:
2322 return 0, 'invalid option value: %s' % ' '.join(detail.args)
2323 except utils.ExtensionOptionError as detail:
2324 return 0, 'invalid option data: %s' % ' '.join(detail.args)
2325 if blank_finish:
2326 return 1, options
2327 else:
2328 return 0, 'option data incompletely parsed'
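# Usage sketch with hypothetical values: given
#
#     option_spec = {'width': directives.positive_int}
#     datalines   = [':width: 200']
#
# a successful parse returns ``(1, {'width': 200})``; an unknown field name or
# a failed conversion returns ``(0, <error string>)`` instead.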
2329
2330 def unknown_directive(self, type_name):
2331 lineno = self.state_machine.abs_line_number()
2332 (indented, indent, offset, blank_finish
2333 ) = self.state_machine.get_first_known_indented(0, strip_indent=False)
2334 text = '\n'.join(indented)
2335 error = self.reporter.error('Unknown directive type "%s".' % type_name,
2336 nodes.literal_block(text, text),
2337 line=lineno)
2338 return [error], blank_finish
2339
2340 def comment(self, match):
2341 if self.state_machine.is_next_line_blank():
2342 first_comment_line = match.string[match.end():]
2343 if not first_comment_line.strip(): # empty comment
2344 return [nodes.comment()], True # "A tiny but practical wart."
2345 if first_comment_line.startswith('end of inclusion from "'):
2346 # cf. parsers.rst.directives.misc.Include
2347 self.document.include_log.pop()
2348 return [], True
2349 (indented, indent, offset, blank_finish
2350 ) = self.state_machine.get_first_known_indented(match.end())
2351 while indented and not indented[-1].strip():
2352 indented.trim_end()
2353 text = '\n'.join(indented)
2354 return [nodes.comment(text, text)], blank_finish
2355
2356 explicit.constructs = [
2357 (footnote,
2358 re.compile(r"""
2359 \.\.[ ]+ # explicit markup start
2360 \[
2361 ( # footnote label:
2362 [0-9]+ # manually numbered footnote
2363 | # *OR*
2364 \# # anonymous auto-numbered footnote
2365 | # *OR*
2366 \#%s # auto-numbered footnote with label
2367 | # *OR*
2368 \* # auto-symbol footnote
2369 )
2370 \]
2371 ([ ]+|$) # whitespace or end of line
2372 """ % Inliner.simplename, re.VERBOSE)),
2373 (citation,
2374 re.compile(r"""
2375 \.\.[ ]+ # explicit markup start
2376 \[(%s)\] # citation label
2377 ([ ]+|$) # whitespace or end of line
2378 """ % Inliner.simplename, re.VERBOSE)),
2379 (hyperlink_target,
2380 re.compile(r"""
2381 \.\.[ ]+ # explicit markup start
2382 _ # target indicator
2383 (?![ ]|$) # first char. not space or EOL
2384 """, re.VERBOSE)),
2385 (substitution_def,
2386 re.compile(r"""
2387 \.\.[ ]+ # explicit markup start
2388 \| # substitution indicator
2389 (?![ ]|$) # first char. not space or EOL
2390 """, re.VERBOSE)),
2391 (directive,
2392 re.compile(r"""
2393 \.\.[ ]+ # explicit markup start
2394 (%s) # directive name
2395 [ ]? # optional space
2396 :: # directive delimiter
2397 ([ ]+|$) # whitespace or end of line
2398 """ % Inliner.simplename, re.VERBOSE))]
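# Summary of the explicit markup forms recognized by the constructs above:
#
#     .. [1]           manually numbered footnote
#     .. [#]           auto-numbered footnote
#     .. [#label]      auto-numbered footnote with label
#     .. [*]           auto-symbol footnote
#     .. [CIT2002]     citation
#     .. _target:      hyperlink target
#     .. |symbol|      substitution definition
#     .. name::        directive
#
# Anything else beginning with ".." falls back to `comment` (see
# `explicit_construct` below).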
2399
2400 def explicit_markup(self, match, context, next_state):
2401 """Footnotes, hyperlink targets, directives, comments."""
2402 nodelist, blank_finish = self.explicit_construct(match)
2403 self.parent += nodelist
2404 self.explicit_list(blank_finish)
2405 return [], next_state, []
2406
2407 def explicit_construct(self, match):
2408 """Determine which explicit construct this is, parse & return it."""
2409 errors = []
2410 for method, pattern in self.explicit.constructs:
2411 expmatch = pattern.match(match.string)
2412 if expmatch:
2413 try:
2414 return method(self, expmatch)
2415 except MarkupError as error:
2416 lineno = self.state_machine.abs_line_number()
2417 message = ' '.join(error.args)
2418 errors.append(self.reporter.warning(message, line=lineno))
2419 break
2420 nodelist, blank_finish = self.comment(match)
2421 return nodelist + errors, blank_finish
2422
2423 def explicit_list(self, blank_finish) -> None:
2424 """
2425 Create a nested state machine for a series of explicit markup
2426 constructs (including anonymous hyperlink targets).
2427 """
2428 offset = self.state_machine.line_offset + 1 # next line
2429 newline_offset, blank_finish = self.nested_list_parse(
2430 self.state_machine.input_lines[offset:],
2431 input_offset=self.state_machine.abs_line_offset() + 1,
2432 node=self.parent, initial_state='Explicit',
2433 blank_finish=blank_finish,
2434 match_titles=self.state_machine.match_titles)
2435 self.goto_line(newline_offset)
2436 if not blank_finish:
2437 self.parent += self.unindent_warning('Explicit markup')
2438
2439 def anonymous(self, match, context, next_state):
2440 """Anonymous hyperlink targets."""
2441 nodelist, blank_finish = self.anonymous_target(match)
2442 self.parent += nodelist
2443 self.explicit_list(blank_finish)
2444 return [], next_state, []
2445
2446 def anonymous_target(self, match):
2447 lineno = self.state_machine.abs_line_number()
2448 (block, indent, offset, blank_finish
2449 ) = self.state_machine.get_first_known_indented(match.end(),
2450 until_blank=True)
2451 blocktext = match.string[:match.end()] + '\n'.join(block)
2452 block = [escape2null(line) for line in block]
2453 target = self.make_target(block, blocktext, lineno, '')
2454 return [target], blank_finish
2455
2456 def line(self, match, context, next_state):
2457 """Section title overline or transition marker."""
2458 if self.state_machine.match_titles:
2459 return [match.string], 'Line', []
2460 elif match.string.strip() == '::':
2461 raise statemachine.TransitionCorrection('text')
2462 elif len(match.string.strip()) < 4:
2463 msg = self.reporter.info(
2464 'Unexpected possible title overline or transition.\n'
2465 "Treating it as ordinary text because it's so short.",
2466 line=self.state_machine.abs_line_number())
2467 self.parent += msg
2468 raise statemachine.TransitionCorrection('text')
2469 else:
2470 blocktext = self.state_machine.line
2471 msg = self.reporter.error(
2472 'Unexpected section title or transition.',
2473 nodes.literal_block(blocktext, blocktext),
2474 line=self.state_machine.abs_line_number())
2475 self.parent += msg
2476 return [], next_state, []
2477
2478 def text(self, match, context, next_state):
2479 """Titles, definition lists, paragraphs."""
2480 return [match.string], 'Text', []
2481
2482
2483class RFC2822Body(Body):
2484
2485 """
2486 RFC2822 headers are only valid as the first constructs in documents. As
2487 soon as anything else appears, the `Body` state should take over.
2488 """
2489
2490 patterns = Body.patterns.copy() # can't modify the original
2491 patterns['rfc2822'] = r'[!-9;-~]+:( +|$)'
2492 initial_transitions = [(name, 'Body')
2493 for name in Body.initial_transitions]
2494 initial_transitions.insert(-1, ('rfc2822', 'Body')) # just before 'text'
2495
2496 def rfc2822(self, match, context, next_state):
2497 """RFC2822-style field list item."""
2498 fieldlist = nodes.field_list(classes=['rfc2822'])
2499 self.parent += fieldlist
2500 field, blank_finish = self.rfc2822_field(match)
2501 fieldlist += field
2502 offset = self.state_machine.line_offset + 1 # next line
2503 newline_offset, blank_finish = self.nested_list_parse(
2504 self.state_machine.input_lines[offset:],
2505 input_offset=self.state_machine.abs_line_offset() + 1,
2506 node=fieldlist, initial_state='RFC2822List',
2507 blank_finish=blank_finish)
2508 self.goto_line(newline_offset)
2509 if not blank_finish:
2510 self.parent += self.unindent_warning(
2511 'RFC2822-style field list')
2512 return [], next_state, []
2513
2514 def rfc2822_field(self, match):
2515 name = match.string[:match.string.find(':')]
2516 (indented, indent, line_offset, blank_finish
2517 ) = self.state_machine.get_first_known_indented(match.end(),
2518 until_blank=True)
2519 fieldnode = nodes.field()
2520 fieldnode += nodes.field_name(name, name)
2521 fieldbody = nodes.field_body('\n'.join(indented))
2522 fieldnode += fieldbody
2523 if indented:
2524 self.nested_parse(indented, input_offset=line_offset,
2525 node=fieldbody)
2526 return fieldnode, blank_finish
2527
2528
2529class SpecializedBody(Body):
2530
2531 """
2532 Superclass for second and subsequent compound element members. Compound
2533 elements are lists and list-like constructs.
2534
2535 All transition methods are disabled (redefined as `invalid_input`).
2536 Override individual methods in subclasses to re-enable.
2537
2538 For example, once an initial bullet list item, say, is recognized, the
2539 `BulletList` subclass takes over, with a "bullet_list" node as its
2540 container. Upon encountering the initial bullet list item, `Body.bullet`
2541 calls its ``self.nested_list_parse`` (`RSTState.nested_list_parse`), which
2542 starts up a nested parsing session with `BulletList` as the initial state.
2543 Only the ``bullet`` transition method is enabled in `BulletList`; as long
2544 as only bullet list items are encountered, they are parsed and inserted
2545 into the container. The first construct which is *not* a bullet list item
2546 triggers the `invalid_input` method, which ends the nested parse and
2547 closes the container. `BulletList` needs to recognize input that is
2548 invalid in the context of a bullet list, which means everything *other
2549 than* bullet list items, so it inherits the transition list created in
2550 `Body`.
2551 """
2552
2553 def invalid_input(self, match=None, context=None, next_state=None):
2554 """Not a compound element member. Abort this state machine."""
2555 self.state_machine.previous_line() # back up so parent SM can reassess
2556 raise EOFError
2557
2558 indent = invalid_input
2559 bullet = invalid_input
2560 enumerator = invalid_input
2561 field_marker = invalid_input
2562 option_marker = invalid_input
2563 doctest = invalid_input
2564 line_block = invalid_input
2565 grid_table_top = invalid_input
2566 simple_table_top = invalid_input
2567 explicit_markup = invalid_input
2568 anonymous = invalid_input
2569 line = invalid_input
2570 text = invalid_input
2571
2572
2573class BulletList(SpecializedBody):
2574
2575 """Second and subsequent bullet_list list_items."""
2576
2577 def bullet(self, match, context, next_state):
2578 """Bullet list item."""
2579 if match.string[0] != self.parent['bullet']:
2580 # different bullet: new list
2581 self.invalid_input()
2582 listitem, blank_finish = self.list_item(match.end())
2583 self.parent += listitem
2584 self.blank_finish = blank_finish
2585 return [], next_state, []
2586
2587
2588class DefinitionList(SpecializedBody):
2589
2590 """Second and subsequent definition_list_items."""
2591
2592 def text(self, match, context, next_state):
2593 """Definition lists."""
2594 return [match.string], 'Definition', []
2595
2596
2597class EnumeratedList(SpecializedBody):
2598
2599 """Second and subsequent enumerated_list list_items."""
2600
2601 def enumerator(self, match, context, next_state):
2602 """Enumerated list item."""
2603 format, sequence, text, ordinal = self.parse_enumerator(
2604 match, self.parent['enumtype'])
2605 if (format != self.format
2606 or (sequence != '#' and (sequence != self.parent['enumtype']
2607 or self.auto
2608 or ordinal != (self.lastordinal + 1)))
2609 or not self.is_enumerated_list_item(ordinal, sequence, format)):
2610 # different enumeration: new list
2611 self.invalid_input()
2612 if sequence == '#':
2613 self.auto = 1
2614 listitem, blank_finish = self.list_item(match.end())
2615 self.parent += listitem
2616 self.blank_finish = blank_finish
2617 self.lastordinal = ordinal
2618 return [], next_state, []
2619
2620
2621class FieldList(SpecializedBody):
2622
2623 """Second and subsequent field_list fields."""
2624
2625 def field_marker(self, match, context, next_state):
2626 """Field list field."""
2627 field, blank_finish = self.field(match)
2628 self.parent += field
2629 self.blank_finish = blank_finish
2630 return [], next_state, []
2631
2632
2633class OptionList(SpecializedBody):
2634
2635 """Second and subsequent option_list option_list_items."""
2636
2637 def option_marker(self, match, context, next_state):
2638 """Option list item."""
2639 try:
2640 option_list_item, blank_finish = self.option_list_item(match)
2641 except MarkupError:
2642 self.invalid_input()
2643 self.parent += option_list_item
2644 self.blank_finish = blank_finish
2645 return [], next_state, []
2646
2647
2648class RFC2822List(SpecializedBody, RFC2822Body):
2649
2650 """Second and subsequent RFC2822-style field_list fields."""
2651
2652 patterns = RFC2822Body.patterns
2653 initial_transitions = RFC2822Body.initial_transitions
2654
2655 def rfc2822(self, match, context, next_state):
2656 """RFC2822-style field list item."""
2657 field, blank_finish = self.rfc2822_field(match)
2658 self.parent += field
2659 self.blank_finish = blank_finish
2660 return [], 'RFC2822List', []
2661
2662 blank = SpecializedBody.invalid_input
2663
2664
2665class ExtensionOptions(FieldList):
2666
2667 """
2668 Parse field_list fields for extension options.
2669
2670 No nested parsing is done (including inline markup parsing).
2671 """
2672
2673 def parse_field_body(self, indented, offset, node) -> None:
2674 """Override `Body.parse_field_body` for simpler parsing."""
2675 lines = []
2676 for line in list(indented) + ['']:
2677 if line.strip():
2678 lines.append(line)
2679 elif lines:
2680 text = '\n'.join(lines)
2681 node += nodes.paragraph(text, text)
2682 lines = []
2683
2684
2685class LineBlock(SpecializedBody):
2686
2687 """Second and subsequent lines of a line_block."""
2688
2689 blank = SpecializedBody.invalid_input
2690
2691 def line_block(self, match, context, next_state):
2692 """New line of line block."""
2693 lineno = self.state_machine.abs_line_number()
2694 line, messages, blank_finish = self.line_block_line(match, lineno)
2695 self.parent += line
2696 self.parent.parent += messages
2697 self.blank_finish = blank_finish
2698 return [], next_state, []
2699
2700
2701class Explicit(SpecializedBody):
2702
2703 """Second and subsequent explicit markup constructs."""
2704
2705 def explicit_markup(self, match, context, next_state):
2706 """Footnotes, hyperlink targets, directives, comments."""
2707 nodelist, blank_finish = self.explicit_construct(match)
2708 self.parent += nodelist
2709 self.blank_finish = blank_finish
2710 return [], next_state, []
2711
2712 def anonymous(self, match, context, next_state):
2713 """Anonymous hyperlink targets."""
2714 nodelist, blank_finish = self.anonymous_target(match)
2715 self.parent += nodelist
2716 self.blank_finish = blank_finish
2717 return [], next_state, []
2718
2719 blank = SpecializedBody.invalid_input
2720
2721
2722class SubstitutionDef(Body):
2723
2724 """
2725 Parser for the contents of a substitution_definition element.
2726 """
2727
2728 patterns = {
2729 'embedded_directive': re.compile(r'(%s)::( +|$)'
2730 % Inliner.simplename),
2731 'text': r''}
2732 initial_transitions = ['embedded_directive', 'text']
2733
2734 def embedded_directive(self, match, context, next_state):
2735 nodelist, blank_finish = self.directive(match,
2736 alt=self.parent['names'][0])
2737 self.parent += nodelist
2738 if not self.state_machine.at_eof():
2739 self.blank_finish = blank_finish
2740 raise EOFError
2741
2742 def text(self, match, context, next_state):
2743 if not self.state_machine.at_eof():
2744 self.blank_finish = self.state_machine.is_next_line_blank()
2745 raise EOFError
2746
2747
2748class Text(RSTState):
2749
2750 """
2751 Classifier of second line of a text block.
2752
2753 Could be a paragraph, a definition list item, or a title.
2754 """
2755
2756 patterns = {'underline': Body.patterns['line'],
2757 'text': r''}
2758 initial_transitions = [('underline', 'Body'), ('text', 'Body')]
2759
2760 def blank(self, match, context, next_state):
2761 """End of paragraph."""
2762 # NOTE: self.paragraph returns [node, system_message(s)], literalnext
2763 paragraph, literalnext = self.paragraph(
2764 context, self.state_machine.abs_line_number() - 1)
2765 self.parent += paragraph
2766 if literalnext:
2767 self.parent += self.literal_block()
2768 return [], 'Body', []
2769
2770 def eof(self, context):
2771 if context:
2772 self.blank(None, context, None)
2773 return []
2774
2775 def indent(self, match, context, next_state):
2776 """Definition list item."""
2777 dl = nodes.definition_list()
2778 # the definition list starts on the line before the indent:
2779 lineno = self.state_machine.abs_line_number() - 1
2780 dl.source, dl.line = self.state_machine.get_source_and_line(lineno)
2781 dl_item, blank_finish = self.definition_list_item(context)
2782 dl += dl_item
2783 self.parent += dl
2784 offset = self.state_machine.line_offset + 1 # next line
2785 newline_offset, blank_finish = self.nested_list_parse(
2786 self.state_machine.input_lines[offset:],
2787 input_offset=self.state_machine.abs_line_offset() + 1,
2788 node=dl, initial_state='DefinitionList',
2789 blank_finish=blank_finish, blank_finish_state='Definition')
2790 self.goto_line(newline_offset)
2791 if not blank_finish:
2792 self.parent += self.unindent_warning('Definition list')
2793 return [], 'Body', []
2794
2795 def underline(self, match, context, next_state):
2796 """Section title."""
2797 lineno = self.state_machine.abs_line_number()
2798 title = context[0].rstrip()
2799 underline = match.string.rstrip()
2800 source = title + '\n' + underline
2801 messages = []
2802 if column_width(title) > len(underline):
2803 if len(underline) < 4:
2804 if self.state_machine.match_titles:
2805 msg = self.reporter.info(
2806 'Possible title underline, too short for the title.\n'
2807 "Treating it as ordinary text because it's so short.",
2808 line=lineno)
2809 self.parent += msg
2810 raise statemachine.TransitionCorrection('text')
2811 else:
2812 blocktext = context[0] + '\n' + self.state_machine.line
2813 msg = self.reporter.warning(
2814 'Title underline too short.',
2815 nodes.literal_block(blocktext, blocktext),
2816 line=lineno)
2817 messages.append(msg)
2818 if not self.state_machine.match_titles:
2819 blocktext = context[0] + '\n' + self.state_machine.line
2820 # We need get_source_and_line() here to report correctly
2821 src, srcline = self.state_machine.get_source_and_line()
2822 # TODO: why is abs_line_number() == srcline+1
2823 # if the error is in a table (try with test_tables.py)?
2824 # print("get_source_and_line", srcline)
2825 # print("abs_line_number", self.state_machine.abs_line_number())
2826 msg = self.reporter.error(
2827 'Unexpected section title.',
2828 nodes.literal_block(blocktext, blocktext),
2829 source=src, line=srcline)
2830 self.parent += messages
2831 self.parent += msg
2832 return [], next_state, []
2833 style = underline[0]
2834 context[:] = []
2835 self.section(title, source, style, lineno - 1, messages)
2836 return [], next_state, []
2837
2838 def text(self, match, context, next_state):
2839 """Paragraph."""
2840 startline = self.state_machine.abs_line_number() - 1
2841 msg = None
2842 try:
2843 block = self.state_machine.get_text_block(flush_left=True)
2844 except statemachine.UnexpectedIndentationError as err:
2845 block, src, srcline = err.args
2846 msg = self.reporter.error('Unexpected indentation.',
2847 source=src, line=srcline)
2848 lines = context + list(block)
2849 paragraph, literalnext = self.paragraph(lines, startline)
2850 self.parent += paragraph
2851 self.parent += msg
2852 if literalnext:
2853 try:
2854 self.state_machine.next_line()
2855 except EOFError:
2856 pass
2857 self.parent += self.literal_block()
2858 return [], next_state, []
2859
2860 def literal_block(self):
2861 """Return a list of nodes."""
2862 (indented, indent, offset, blank_finish
2863 ) = self.state_machine.get_indented()
2864 while indented and not indented[-1].strip():
2865 indented.trim_end()
2866 if not indented:
2867 return self.quoted_literal_block()
2868 data = '\n'.join(indented)
2869 literal_block = nodes.literal_block(data, data)
2870 (literal_block.source,
2871 literal_block.line) = self.state_machine.get_source_and_line(offset+1)
2872 nodelist = [literal_block]
2873 if not blank_finish:
2874 nodelist.append(self.unindent_warning('Literal block'))
2875 return nodelist
2876
2877 def quoted_literal_block(self):
2878 abs_line_offset = self.state_machine.abs_line_offset()
2879 offset = self.state_machine.line_offset
2880 parent_node = nodes.Element()
2881 new_abs_offset = self.nested_parse(
2882 self.state_machine.input_lines[offset:],
2883 input_offset=abs_line_offset, node=parent_node, match_titles=False,
2884 state_machine_kwargs={'state_classes': (QuotedLiteralBlock,),
2885 'initial_state': 'QuotedLiteralBlock'})
2886 self.goto_line(new_abs_offset)
2887 return parent_node.children
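# Illustrative example: a quoted (unindented) literal block repeats one
# non-alphanumeric quoting character at the start of each line, e.g.
#
#     The following lines are quoted literally::
#
#     > line one
#     > line two
#
# The nested parse with `QuotedLiteralBlock` collects the consistently quoted
# lines into a single literal_block node.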
2888
2889 def definition_list_item(self, termline):
2890 # the parser is already on the second (indented) line:
2891 dd_lineno = self.state_machine.abs_line_number()
2892 dt_lineno = dd_lineno - 1
2893 (indented, indent, line_offset, blank_finish
2894 ) = self.state_machine.get_indented()
2895 dl_item = nodes.definition_list_item(
2896 '\n'.join(termline + list(indented)))
2897 (dl_item.source,
2898 dl_item.line) = self.state_machine.get_source_and_line(dt_lineno)
2899 dt_nodes, messages = self.term(termline, dt_lineno)
2900 dl_item += dt_nodes
2901 dd = nodes.definition('', *messages)
2902 dd.source, dd.line = self.state_machine.get_source_and_line(dd_lineno)
2903 dl_item += dd
2904 if termline[0][-2:] == '::':
2905 dd += self.reporter.info(
2906 'Blank line missing before literal block (after the "::")? '
2907 'Interpreted as a definition list item.',
2908 line=dd_lineno)
2909 # TODO: drop a definition if it is an empty comment to allow
2910 # definition list items with several terms?
2911 # https://sourceforge.net/p/docutils/feature-requests/60/
2912 self.nested_parse(indented, input_offset=line_offset, node=dd)
2913 return dl_item, blank_finish
2914
2915 classifier_delimiter = re.compile(' +: +')
2916
2917 def term(self, lines, lineno):
2918 """Return a definition_list's term and optional classifiers."""
2919 assert len(lines) == 1
2920 text_nodes, messages = self.inline_text(lines[0], lineno)
2921 dt = nodes.term(lines[0])
2922 dt.source, dt.line = self.state_machine.get_source_and_line(lineno)
2923 node_list = [dt]
2924 for i in range(len(text_nodes)):
2925 node = text_nodes[i]
2926 if isinstance(node, nodes.Text):
2927 parts = self.classifier_delimiter.split(node)
2928 if len(parts) == 1:
2929 node_list[-1] += node
2930 else:
2931 text = parts[0].rstrip()
2932 textnode = nodes.Text(text)
2933 node_list[-1] += textnode
2934 node_list += [nodes.classifier(unescape(part, True), part)
2935 for part in parts[1:]]
2936 else:
2937 node_list[-1] += node
2938 return node_list, messages
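# Example: the term line
#
#     gravity : physics : classical
#
# produces a ``term`` node containing "gravity" followed by two ``classifier``
# nodes, "physics" and "classical", split on the " : " delimiter defined by
# `classifier_delimiter` above.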
2939
2940
2941class SpecializedText(Text):
2942
2943 """
2944 Superclass for second and subsequent lines of Text-variants.
2945
2946 All transition methods are disabled. Override individual methods in
2947 subclasses to re-enable.
2948 """
2949
2950 def eof(self, context):
2951 """Incomplete construct."""
2952 return []
2953
2954 def invalid_input(self, match=None, context=None, next_state=None):
2955 """Not a compound element member. Abort this state machine."""
2956 raise EOFError
2957
2958 blank = invalid_input
2959 indent = invalid_input
2960 underline = invalid_input
2961 text = invalid_input
2962
2963
2964class Definition(SpecializedText):
2965
2966 """Second line of potential definition_list_item."""
2967
2968 def eof(self, context):
2969 """Not a definition."""
2970 self.state_machine.previous_line(2) # so parent SM can reassess
2971 return []
2972
2973 def indent(self, match, context, next_state):
2974 """Definition list item."""
2975 dl_item, blank_finish = self.definition_list_item(context)
2976 self.parent += dl_item
2977 self.blank_finish = blank_finish
2978 return [], 'DefinitionList', []
2979
2980
2981class Line(SpecializedText):
2982
2983 """
2984 Second line of over- & underlined section title or transition marker.
2985 """
2986
2987 eofcheck = 1 # ignored, will be removed in Docutils 2.0.
2988
2989 def eof(self, context):
2990 """Transition marker at end of section or document."""
2991 marker = context[0].strip()
2992 if len(marker) < 4:
2993 self.state_correction(context)
2994 src, srcline = self.state_machine.get_source_and_line()
2995 # lineno = self.state_machine.abs_line_number() - 1
2996 transition = nodes.transition(rawsource=context[0])
2997 transition.source = src
2998 transition.line = srcline - 1
2999 # transition.line = lineno
3000 self.parent += transition
3001 return []
3002
3003 def blank(self, match, context, next_state):
3004 """Transition marker."""
3005 src, srcline = self.state_machine.get_source_and_line()
3006 marker = context[0].strip()
3007 if len(marker) < 4:
3008 self.state_correction(context)
3009 transition = nodes.transition(rawsource=marker)
3010 transition.source = src
3011 transition.line = srcline - 1
3012 self.parent += transition
3013 return [], 'Body', []
3014
3015 def text(self, match, context, next_state):
3016 """Potential over- & underlined title."""
3017 lineno = self.state_machine.abs_line_number() - 1
3018 overline = context[0]
3019 title = match.string
3020 underline = ''
3021 try:
3022 underline = self.state_machine.next_line()
3023 except EOFError:
3024 blocktext = overline + '\n' + title
3025 if len(overline.rstrip()) < 4:
3026 self.short_overline(context, blocktext, lineno, 2)
3027 else:
3028 msg = self.reporter.error(
3029 'Incomplete section title.',
3030 nodes.literal_block(blocktext, blocktext),
3031 line=lineno)
3032 self.parent += msg
3033 return [], 'Body', []
3034 source = '%s\n%s\n%s' % (overline, title, underline)
3035 overline = overline.rstrip()
3036 underline = underline.rstrip()
3037 if not self.transitions['underline'][0].match(underline):
3038 blocktext = overline + '\n' + title + '\n' + underline
3039 if len(overline.rstrip()) < 4:
3040 self.short_overline(context, blocktext, lineno, 2)
3041 else:
3042 msg = self.reporter.error(
3043 'Missing matching underline for section title overline.',
3044 nodes.literal_block(source, source),
3045 line=lineno)
3046 self.parent += msg
3047 return [], 'Body', []
3048 elif overline != underline:
3049 blocktext = overline + '\n' + title + '\n' + underline
3050 if len(overline.rstrip()) < 4:
3051 self.short_overline(context, blocktext, lineno, 2)
3052 else:
3053 msg = self.reporter.error(
3054 'Title overline & underline mismatch.',
3055 nodes.literal_block(source, source),
3056 line=lineno)
3057 self.parent += msg
3058 return [], 'Body', []
3059 title = title.rstrip()
3060 messages = []
3061 if column_width(title) > len(overline):
3062 blocktext = overline + '\n' + title + '\n' + underline
3063 if len(overline.rstrip()) < 4:
3064 self.short_overline(context, blocktext, lineno, 2)
3065 else:
3066 msg = self.reporter.warning(
3067 'Title overline too short.',
3068 nodes.literal_block(source, source),
3069 line=lineno)
3070 messages.append(msg)
3071 style = (overline[0], underline[0])
3072 self.section(title.lstrip(), source, style, lineno + 1, messages)
3073 return [], 'Body', []
3074
3075 indent = text # indented title
3076
3077 def underline(self, match, context, next_state):
3078 overline = context[0]
3079 blocktext = overline + '\n' + self.state_machine.line
3080 lineno = self.state_machine.abs_line_number() - 1
3081 if len(overline.rstrip()) < 4:
3082 self.short_overline(context, blocktext, lineno, 1)
3083 msg = self.reporter.error(
3084 'Invalid section title or transition marker.',
3085 nodes.literal_block(blocktext, blocktext),
3086 line=lineno)
3087 self.parent += msg
3088 return [], 'Body', []
3089
3090 def short_overline(self, context, blocktext, lineno, lines=1) -> None:
3091 msg = self.reporter.info(
3092 'Possible incomplete section title.\nTreating the overline as '
3093 "ordinary text because it's so short.",
3094 line=lineno)
3095 self.parent += msg
3096 self.state_correction(context, lines)
3097
3098 def state_correction(self, context, lines=1):
3099 self.state_machine.previous_line(lines)
3100 context[:] = []
3101 raise statemachine.StateCorrection('Body', 'text')
3102
3103
3104class QuotedLiteralBlock(RSTState):
3105
3106 """
3107 Nested parse handler for quoted (unindented) literal blocks.
3108
3109 Special-purpose. Not for inclusion in `state_classes`.
3110 """
3111
3112 patterns = {'initial_quoted': r'(%(nonalphanum7bit)s)' % Body.pats,
3113 'text': r''}
3114 initial_transitions = ('initial_quoted', 'text')
3115
3116 def __init__(self, state_machine, debug=False) -> None:
3117 RSTState.__init__(self, state_machine, debug)
3118 self.messages = []
3119 self.initial_lineno = None
3120
3121 def blank(self, match, context, next_state):
3122 if context:
3123 raise EOFError
3124 else:
3125 return context, next_state, []
3126
3127 def eof(self, context):
3128 if context:
3129 src, srcline = self.state_machine.get_source_and_line(
3130 self.initial_lineno)
3131 text = '\n'.join(context)
3132 literal_block = nodes.literal_block(text, text)
3133 literal_block.source = src
3134 literal_block.line = srcline
3135 self.parent += literal_block
3136 else:
3137 self.parent += self.reporter.warning(
3138 'Literal block expected; none found.',
3139 line=self.state_machine.abs_line_number()
3140 ) # src not available, statemachine.input_lines is empty
3141 self.state_machine.previous_line()
3142 self.parent += self.messages
3143 return []
3144
3145 def indent(self, match, context, next_state):
3146 assert context, ('QuotedLiteralBlock.indent: context should not '
3147 'be empty!')
3148 self.messages.append(
3149 self.reporter.error('Unexpected indentation.',
3150 line=self.state_machine.abs_line_number()))
3151 self.state_machine.previous_line()
3152 raise EOFError
3153
3154 def initial_quoted(self, match, context, next_state):
3155 """Match arbitrary quote character on the first line only."""
3156 self.remove_transition('initial_quoted')
3157 quote = match.string[0]
3158 pattern = re.compile(re.escape(quote))
3159 # New transition matches consistent quotes only:
3160 self.add_transition('quoted',
3161 (pattern, self.quoted, self.__class__.__name__))
3162 self.initial_lineno = self.state_machine.abs_line_number()
3163 return [match.string], next_state, []
3164
3165 def quoted(self, match, context, next_state):
3166 """Match consistent quotes on subsequent lines."""
3167 context.append(match.string)
3168 return context, next_state, []
3169
3170 def text(self, match, context, next_state):
3171 if context:
3172 self.messages.append(
3173 self.reporter.error('Inconsistent literal block quoting.',
3174 line=self.state_machine.abs_line_number()))
3175 self.state_machine.previous_line()
3176 raise EOFError
3177
3178
3179state_classes = (Body, BulletList, DefinitionList, EnumeratedList, FieldList,
3180 OptionList, LineBlock, ExtensionOptions, Explicit, Text,
3181 Definition, Line, SubstitutionDef, RFC2822Body, RFC2822List)
3182"""Standard set of State classes used to start `RSTStateMachine`."""