1# $Id: states.py 10351 2026-06-11 21:51:21Z milde $
2# Author: David Goodger <goodger@python.org>
3# Copyright: This module has been placed in the public domain.
4
5"""
6This is the ``docutils.parsers.rst.states`` module, the core of
7the reStructuredText parser. It defines the following:
8
9:Classes:
10 - `RSTStateMachine`: reStructuredText parser's entry point.
11 - `NestedStateMachine`: recursive StateMachine.
12 - `RSTState`: reStructuredText State superclass.
13 - `Inliner`: For parsing inline markup.
14 - `Body`: Generic classifier of the first line of a block.
15 - `SpecializedBody`: Superclass for compound element members.
16 - `BulletList`: Second and subsequent bullet_list list_items
17 - `DefinitionList`: Second+ definition_list_items.
18 - `EnumeratedList`: Second+ enumerated_list list_items.
19 - `FieldList`: Second+ fields.
20 - `OptionList`: Second+ option_list_items.
21 - `RFC2822List`: Second+ RFC2822-style fields.
22 - `ExtensionOptions`: Parses directive option fields.
23 - `Explicit`: Second+ explicit markup constructs.
24 - `SubstitutionDef`: For embedded directives in substitution definitions.
25 - `Text`: Classifier of second line of a text block.
26 - `SpecializedText`: Superclass for continuation lines of Text-variants.
27 - `Definition`: Second line of potential definition_list_item.
28 - `Line`: Second line of overlined section title or transition marker.
29 - `Struct`: obsolete, use `types.SimpleNamespace`.
30
31:Exception classes:
32 - `MarkupError`
33 - `ParserError`
34 - `MarkupMismatch`
35
36:Functions:
37 - `escape2null()`: Return a string, escape-backslashes converted to nulls.
38 - `unescape()`: Return a string, nulls removed or restored to backslashes.
39
40:Attributes:
41 - `state_classes`: set of State classes used with `RSTStateMachine`.
42
43Parser Overview
44===============
45
46The reStructuredText parser is implemented as a recursive state machine,
47examining its input one line at a time. To understand how the parser works,
48please first become familiar with the `docutils.statemachine` module. In the
49description below, references are made to classes defined in this module;
50please see the individual classes for details.
51
52Parsing proceeds as follows:
53
541. The state machine examines each line of input, checking each of the
55 transition patterns of the state `Body`, in order, looking for a match.
56 The implicit transitions (blank lines and indentation) are checked before
57 any others. The 'text' transition is a catch-all (matches anything).
58
592. The method associated with the matched transition pattern is called.
60
61 A. Some transition methods are self-contained, appending elements to the
62 document tree (`Body.doctest` parses a doctest block). The parser's
63 current line index is advanced to the end of the element, and parsing
64 continues with step 1.
65
66 B. Other transition methods trigger the creation of a nested state machine,
67 whose job is to parse a compound construct ('indent' does a block quote,
68 'bullet' does a bullet list, 'overline' does a section [first checking
69 for a valid section header], etc.).
70
71 - In the case of lists and explicit markup, a one-off state machine is
72 created and run to parse contents of the first item.
73
74 - A new state machine is created and its initial state is set to the
75 appropriate specialized state (`BulletList` in the case of the
76 'bullet' transition; see `SpecializedBody` for more detail). This
77 state machine is run to parse the compound element (or series of
78 explicit markup elements), and returns as soon as a non-member element
79 is encountered. For example, the `BulletList` state machine ends as
80 soon as it encounters an element which is not a list item of that
81 bullet list. The optional omission of inter-element blank lines is
82 enabled by this nested state machine.
83
84 - The current line index is advanced to the end of the elements parsed,
85 and parsing continues with step 1.
86
87 C. The result of the 'text' transition depends on the next line of text.
88 The current state is changed to `Text`, under which the second line is
89 examined. If the second line is:
90
91 - Indented: The element is a definition list item, and parsing proceeds
92 similarly to step 2.B, using the `DefinitionList` state.
93
94 - A line of uniform punctuation characters: The element is a section
95 header; again, parsing proceeds as in step 2.B, and `Body` is still
96 used.
97
98 - Anything else: The element is a paragraph, which is examined for
99 inline markup and appended to the parent element. Processing
100 continues with step 1.
101"""
102
103from __future__ import annotations
104
105__docformat__ = 'reStructuredText'
106
107import re
108from types import FunctionType, MethodType
109from types import SimpleNamespace as Struct
110import warnings
111
112from docutils import nodes, statemachine, utils
113from docutils import ApplicationError, DataError
114from docutils.statemachine import StateMachineWS, StateWS
115from docutils.nodes import fully_normalize_name as normalize_name
116from docutils.nodes import unescape, whitespace_normalize_name
117import docutils.parsers.rst
118from docutils.parsers.rst import directives, languages, tableparser, roles
119from docutils.utils import escape2null, column_width, strip_combining_chars
120from docutils.utils import punctuation_chars, urischemes
121from docutils.utils import split_escaped_whitespace
122from docutils.utils._roman_numerals import (InvalidRomanNumeralError,
123 RomanNumeral)
124
125TYPE_CHECKING = False
126if TYPE_CHECKING:
127 from docutils.statemachine import StringList
128
129
130class MarkupError(DataError): pass
131class UnknownInterpretedRoleError(DataError): pass
132class InterpretedRoleNotImplementedError(DataError): pass
133class ParserError(ApplicationError): pass
134class MarkupMismatch(Exception): pass
135
136
137class RSTStateMachine(StateMachineWS):
138
139 """
140 reStructuredText's master StateMachine.
141
142 The entry point to reStructuredText parsing is the `run()` method.
143 """
144 section_level_offset: int = 0
145 """Correction term for section level determination in nested parsing.
146
147 Updated by `RSTState.nested_parse()` and used in
148 `RSTState.check_subsection()` to compensate differences when
149 nested parsing uses a detached base node with a document-wide
150 section title style hierarchy or the current node with a new,
151 independent title style hierarchy.
152 """
153
154 def run(self, input_lines, document, input_offset=0, match_titles=True,
155 inliner=None) -> None:
156 """
157 Parse `input_lines` and modify the `document` node in place.
158
159 Extend `StateMachineWS.run()`: set up parse-global data and
160 run the StateMachine.
161 """
162 self.language = languages.get_language(
163 document.settings.language_code, document.reporter)
164 self.match_titles = match_titles
165 if inliner is None:
166 inliner = Inliner()
167 inliner.init_customizations(document.settings)
168 # A collection of objects to share with nested parsers.
169 # The attributes `reporter`, `section_level`, and
170 # `section_bubble_up_kludge` will be removed in Docutils 2.0
171 self.memo = Struct(document=document,
172 reporter=document.reporter, # ignored
173 language=self.language,
174 title_styles=[],
175 section_level=0, # ignored
176 section_bubble_up_kludge=False, # ignored
177 inliner=inliner)
178 self.document = document
179 self.attach_observer(document.note_source)
180 self.reporter = self.document.reporter
181 self.node = document
182 results = StateMachineWS.run(self, input_lines, input_offset,
183 input_source=document['source'])
184 assert results == [], 'RSTStateMachine.run() results should be empty!'
185 self.node = self.memo = None # remove unneeded references
186
187
188class NestedStateMachine(RSTStateMachine):
189 """
190 StateMachine run from within other StateMachine runs, to parse nested
191 document structures.
192 """
193
194 def __init__(self, state_classes, initial_state,
195 debug=False, parent_state_machine=None) -> None:
196
197 self.parent_state_machine = parent_state_machine
198 """The instance of the parent state machine."""
199
200 super().__init__(state_classes, initial_state, debug)
201
202 def run(self, input_lines, input_offset, memo, node, match_titles=True):
203 """
204 Parse `input_lines` and populate `node`.
205
206 Extend `StateMachineWS.run()`: set up document-wide data.
207 """
208 self.match_titles = match_titles
209 self.memo = memo
210 self.document = memo.document
211 self.attach_observer(self.document.note_source)
212 self.language = memo.language
213 self.reporter = self.document.reporter
214 self.node = node
215 results = StateMachineWS.run(self, input_lines, input_offset)
216 assert results == [], ('NestedStateMachine.run() results should be '
217 'empty!')
218 return results
219
220
221class RSTState(StateWS):
222
223 """
224 reStructuredText State superclass.
225
226 Contains methods used by all State subclasses.
227 """
228
229 nested_sm = NestedStateMachine
230 nested_sm_cache = []
231
232 def __init__(self, state_machine: RSTStateMachine, debug=False) -> None:
233 self.nested_sm_kwargs = {'state_classes': state_classes,
234 'initial_state': 'Body'}
235 StateWS.__init__(self, state_machine, debug)
236
237 def runtime_init(self) -> None:
238 StateWS.runtime_init(self)
239 memo = self.state_machine.memo
240 self.memo = memo
241 self.document = memo.document
242 self.inliner = memo.inliner
243 self.reporter = self.document.reporter
244 # enable the reporter to determine source and source-line
245 if not hasattr(self.reporter, 'get_source_and_line'):
246 self.reporter.get_source_and_line = self.state_machine.get_source_and_line # noqa:E501
247
248 @property
249 def parent(self) -> nodes.Element | None:
250 return self.state_machine.node
251
252 @parent.setter
253 def parent(self, value: nodes.Element):
254 self.state_machine.node = value
255
256 def goto_line(self, abs_line_offset) -> None:
257 """
258 Jump to input line `abs_line_offset`, ignoring jumps past the end.
259 """
260 try:
261 self.state_machine.goto_line(abs_line_offset)
262 except EOFError:
263 pass
264
265 def no_match(self, context, transitions):
266 """
267 Override `StateWS.no_match` to generate a system message.
268
269 This code should never be run.
270 """
271 self.reporter.severe(
272 'Internal error: no transition pattern match. State: "%s"; '
273 'transitions: %s; context: %s; current line: %r.'
274 % (self.__class__.__name__, transitions, context,
275 self.state_machine.line))
276 return context, None, []
277
278 def bof(self, context):
279 """Called at beginning of file."""
280 return [], []
281
282 def nested_parse(self,
283 block: StringList,
284 input_offset: int,
285 node: nodes.Element|None = None,
286 match_titles: bool = False,
287 state_machine_class: StateMachineWS|None = None,
288 state_machine_kwargs: dict|None = None
289 ) -> int:
290 """
291 Parse the input `block` with a nested state-machine rooted at `node`.
292
293 :block:
294 reStructuredText source extract.
295 :input_offset:
296 Line number at start of the block.
297 :node:
298 Base node. Generated nodes will be appended to this node.
299 Default: the "current node" (`self.state_machine.node`).
300 :match_titles:
301 Allow section titles?
302 Caution: With a custom base node, this may lead to an invalid
303 or mixed up document tree. [#]_
304 :state_machine_class:
305 Default: `NestedStateMachine`.
306 :state_machine_kwargs:
307 Keyword arguments for the state-machine instantiation.
308 Default: `self.nested_sm_kwargs`.
309
310 Create a new state-machine instance if required.
311 Return new offset.
312
313 .. [#] See also ``test_parsers/test_rst/test_nested_parsing.py``
314 and Sphinx's `nested_parse_to_nodes()`__.
315
316 __ https://www.sphinx-doc.org/en/master/extdev/utils.html
317 #sphinx.util.parsing.nested_parse_to_nodes
318 """
319 if node is None:
320 node = self.state_machine.node
321 use_default = 0
322 if state_machine_class is None:
323 state_machine_class = self.nested_sm
324 use_default += 1
325 if state_machine_kwargs is None:
326 state_machine_kwargs = self.nested_sm_kwargs
327 use_default += 1
328 my_state_machine = None
329 if use_default == 2:
330 try:
331 # get cached state machine, prevent others from using it
332 my_state_machine = self.nested_sm_cache.pop()
333 except IndexError:
334 pass
335 if not my_state_machine:
336 my_state_machine = state_machine_class(
337 debug=self.debug,
338 parent_state_machine=self.state_machine,
339 **state_machine_kwargs)
340 # Check if we may use sections (with a caveat for custom nodes
341 # that may be dummies to collect children):
342 if (node == self.state_machine.node
343 and not isinstance(node, (nodes.document, nodes.section))):
344 match_titles = False # avoid invalid sections
345 if match_titles:
346 # Compensate mismatch of known title styles and number of
347 # parent sections of the base node if the document wide
348 # title styles are used with a detached base node or
349 # a new list of title styles with the current parent node:
350 l_node = len(node.section_hierarchy())
351 l_start = min(len(self.parent.section_hierarchy()),
352 len(self.memo.title_styles))
353 my_state_machine.section_level_offset = l_start - l_node
354
355 # run the state machine and populate `node`:
356 block_length = len(block)
357 my_state_machine.run(block, input_offset, self.memo,
358 node, match_titles)
359
360 if match_titles:
361 if node == self.state_machine.node:
362 # Pass on the new "current node" to parent state machines:
363 sm = self.state_machine
364 try:
365 while True:
366 sm.node = my_state_machine.node
367 sm = sm.parent_state_machine
368 except AttributeError:
369 pass
370 # clean up
371 new_offset = my_state_machine.abs_line_offset()
372 if use_default == 2:
373 self.nested_sm_cache.append(my_state_machine)
374 else:
375 my_state_machine.unlink()
376 # No `block.parent` implies disconnected -- lines aren't in sync:
377 if block.parent and (len(block) - block_length) != 0:
378 # Adjustment for block if modified in nested parse:
379 self.state_machine.next_line(len(block) - block_length)
380 return new_offset
381
382 def nested_list_parse(self, block, input_offset, node, initial_state,
383 blank_finish,
384 blank_finish_state=None,
385 extra_settings={},
386 match_titles=False, # deprecated, will be removed
387 state_machine_class=None,
388 state_machine_kwargs=None):
389 """
390 Parse the input `block` with a nested state-machine rooted at `node`.
391
392 Create a new StateMachine rooted at `node` and run it over the
393 input `block` (see also `nested_parse()`).
394 Also keep track of optional intermediate blank lines and the
395 required final one.
396
397 Return new offset and a boolean indicating whether there was a
398 blank final line.
399 """
400 if match_titles:
401 warnings.warn('The "match_titles" argument of '
402 'parsers.rst.states.RSTState.nested_list_parse() '
403 'will be ignored in Docutils 1.0 '
404 'and removed in Docutils 2.0.',
405 PendingDeprecationWarning, stacklevel=2)
406 if state_machine_class is None:
407 state_machine_class = self.nested_sm
408 if state_machine_kwargs is None:
409 state_machine_kwargs = self.nested_sm_kwargs.copy()
410 state_machine_kwargs['initial_state'] = initial_state
411 my_state_machine = state_machine_class(
412 debug=self.debug,
413 parent_state_machine=self.state_machine,
414 **state_machine_kwargs)
415 if blank_finish_state is None:
416 blank_finish_state = initial_state
417 my_state_machine.states[blank_finish_state].blank_finish = blank_finish
418 for key, value in extra_settings.items():
419 setattr(my_state_machine.states[initial_state], key, value)
420 my_state_machine.run(block, input_offset, memo=self.memo,
421 node=node, match_titles=match_titles)
422 blank_finish = my_state_machine.states[blank_finish_state].blank_finish
423 my_state_machine.unlink()
424 return my_state_machine.abs_line_offset(), blank_finish
425
426 def section(self, title, source, style, lineno, messages) -> None:
427 """Check for a valid subsection and create one if it checks out."""
428 if self.check_subsection(source, style, lineno):
429 self.new_subsection(title, lineno, messages)
430
431 def check_subsection(self, source, style, lineno) -> bool:
432 """
433 Check for a valid subsection header. Update section data in `memo`.
434
435 When a new section is reached that isn't a subsection of the current
436 section, set `self.parent` to the new section's parent section
437 (or the root node if the new section is a top-level section).
438 """
439 title_styles = self.memo.title_styles
440 parent_sections = self.parent.section_hierarchy()
441 # current section level: (0 root, 1 section, 2 subsection, ...)
442 oldlevel = (len(parent_sections)
443 + self.state_machine.section_level_offset)
444 # new section level:
445 try: # check for existing title style
446 newlevel = title_styles.index(style) + 1
447 except ValueError: # new title style
448 newlevel = len(title_styles) + 1
449 # The new level must not be deeper than an immediate child
450 # of the current level:
451 if newlevel > oldlevel + 1:
452 styles = ' '.join('/'.join(style) for style in title_styles)
453 self.parent += self.reporter.error(
454 'Inconsistent title style:'
455 f' skip from level {oldlevel} to {newlevel}.',
456 nodes.literal_block('', source),
457 nodes.paragraph('', f'Established title styles: {styles}'),
458 line=lineno)
459 return False
460 if newlevel <= oldlevel:
461 # new section is sibling or higher up in the section hierarchy
462 try:
463 new_parent = parent_sections[newlevel-oldlevel-1].parent
464 except IndexError:
465 styles = ' '.join('/'.join(style) for style in title_styles)
466 details = (f'The parent of level {newlevel} sections cannot'
467 ' be reached. The parser is at section level'
468 f' {oldlevel} but the current node has only'
469 f' {len(parent_sections)} parent section(s).'
470 '\nOne reason may be a high level'
471 ' section used in a directive that parses its'
472 ' content into a base node not attached to'
473 ' the document\n(up to Docutils 0.21,'
474 ' these sections were silently dropped).')
475 self.parent += self.reporter.error(
476 f'A level {newlevel} section cannot be used here.',
477 nodes.literal_block('', source),
478 nodes.paragraph('', f'Established title styles: {styles}'),
479 nodes.paragraph('', details),
480 line=lineno)
481 return False
482 self.parent = new_parent
483 self.memo.section_level = newlevel - 1
484 if newlevel > len(title_styles):
485 title_styles.append(style)
486 return True
487
488 def title_inconsistent(self, sourcetext, lineno):
489 # Ignored. Will be removed in Docutils 2.0.
490 error = self.reporter.error(
491 'Title level inconsistent:', nodes.literal_block('', sourcetext),
492 line=lineno)
493 return error
494
495 def new_subsection(self, title, lineno, messages):
496 """Append new subsection to document tree."""
497 section_node = nodes.section()
498 self.parent += section_node
499 textnodes, title_messages = self.inline_text(title, lineno)
500 titlenode = nodes.title(title, '', *textnodes)
501 name = normalize_name(titlenode.astext())
502 section_node['names'].append(name)
503 section_node += titlenode
504 section_node += messages
505 section_node += title_messages
506 self.document.note_implicit_target(section_node, section_node)
507 # Update state:
508 self.parent = section_node
509 self.memo.section_level += 1
510
511 def paragraph(self, lines, lineno):
512 """
513 Return a list (paragraph & messages) & a boolean: literal_block next?
514 """
515 data = '\n'.join(lines).rstrip()
516 if re.search(r'(?<!\\)(\\\\)*::$', data):
517 if len(data) == 2:
518 return [], 1
519 elif data[-3] in ' \n':
520 text = data[:-3].rstrip()
521 else:
522 text = data[:-1]
523 literalnext = 1
524 else:
525 text = data
526 literalnext = 0
527 textnodes, messages = self.inline_text(text, lineno)
528 p = nodes.paragraph(data, '', *textnodes)
529 p.source, p.line = self.state_machine.get_source_and_line(lineno)
530 return [p] + messages, literalnext
531
532 def inline_text(self, text, lineno):
533 """
534 Return 2 lists: nodes (text and inline elements), and system_messages.
535 """
536 nodes, messages = self.inliner.parse(text, lineno,
537 self.memo, self.parent)
538 return nodes, messages
539
540 def unindent_warning(self, node_name):
541 # the actual problem is one line below the current line
542 lineno = self.state_machine.abs_line_number() + 1
543 return self.reporter.warning('%s ends without a blank line; '
544 'unexpected unindent.' % node_name,
545 line=lineno)
546
547
548def build_regexp(definition, compile_patterns=True):
549 """
550 Build, compile and return a regular expression based on `definition`.
551
552 :Parameter: `definition`: a 4-tuple (group name, prefix, suffix, parts),
553 where "parts" is a list of regular expressions and/or regular
554 expression definitions to be joined into an or-group.
555 """
556 name, prefix, suffix, parts = definition
557 part_strings = []
558 for part in parts:
559 if isinstance(part, tuple):
560 part_strings.append(build_regexp(part, None))
561 else:
562 part_strings.append(part)
563 or_group = '|'.join(part_strings)
564 regexp = '%(prefix)s(?P<%(name)s>%(or_group)s)%(suffix)s' % locals()
565 if compile_patterns:
566 return re.compile(regexp)
567 else:
568 return regexp
569
570
571class Inliner:
572
573 """
574 Parse inline markup; call the `parse()` method.
575 """
576
577 def __init__(self) -> None:
578 self.implicit_dispatch = []
579 """List of (pattern, bound method) tuples, used by
580 `self.implicit_inline`."""
581
582 def init_customizations(self, settings) -> None:
583 # lookahead and look-behind expressions for inline markup rules
584 if getattr(settings, 'character_level_inline_markup', False):
585 start_string_prefix = '(^|(?<!\x00))'
586 end_string_suffix = ''
587 else:
588 start_string_prefix = ('(^|(?<=\\s|[%s%s]))' %
589 (punctuation_chars.openers,
590 punctuation_chars.delimiters))
591 end_string_suffix = ('($|(?=\\s|[\x00%s%s%s]))' %
592 (punctuation_chars.closing_delimiters,
593 punctuation_chars.delimiters,
594 punctuation_chars.closers))
595 args = locals().copy()
596 args.update(vars(self.__class__))
597
598 parts = ('initial_inline', start_string_prefix, '',
599 [
600 ('start', '', self.non_whitespace_after, # simple start-strings
601 [r'\*\*', # strong
602 r'\*(?!\*)', # emphasis but not strong
603 r'``', # literal
604 r'_`', # inline internal target
605 r'\|(?!\|)'] # substitution reference
606 ),
607 ('whole', '', end_string_suffix, # whole constructs
608 [ # reference name & end-string
609 r'(?P<refname>%s)(?P<refend>__?)' % self.simplename,
610 ('footnotelabel', r'\[', r'(?P<fnend>\]_)',
611 [r'[0-9]+', # manually numbered
612 r'\#(%s)?' % self.simplename, # auto-numbered (w/ label?)
613 r'\*', # auto-symbol
614 r'(?P<citationlabel>%s)' % self.simplename, # citation ref
615 ]
616 )
617 ]
618 ),
619 ('backquote', # interpreted text or phrase reference
620 '(?P<role>(:%s:)?)' % self.simplename, # optional role
621 self.non_whitespace_after,
622 ['`(?!`)'] # but not literal
623 )
624 ]
625 )
626 self.start_string_prefix = start_string_prefix
627 self.end_string_suffix = end_string_suffix
628 self.parts = parts
629
630 self.patterns = Struct(
631 initial=build_regexp(parts),
632 emphasis=re.compile(self.non_whitespace_escape_before
633 + r'(\*)' + end_string_suffix),
634 strong=re.compile(self.non_whitespace_escape_before
635 + r'(\*\*)' + end_string_suffix),
636 interpreted_or_phrase_ref=re.compile(
637 r"""
638 %(non_unescaped_whitespace_escape_before)s
639 (
640 `
641 (?P<suffix>
642 (?P<role>:%(simplename)s:)?
643 (?P<refend>__?)?
644 )
645 )
646 %(end_string_suffix)s
647 """ % args, re.VERBOSE),
648 embedded_link=re.compile(
649 r"""
650 (
651 (?:[ \n]+|^) # spaces or beginning of line/string
652 < # open bracket
653 %(non_whitespace_after)s
654 (([^<>]|\x00[<>])+) # anything but unescaped angle brackets
655 %(non_whitespace_escape_before)s
656 > # close bracket
657 )
658 $ # end of string
659 """ % args, re.VERBOSE),
660 literal=re.compile(self.non_whitespace_before + '(``)'
661 + end_string_suffix),
662 target=re.compile(self.non_whitespace_escape_before
663 + r'(`)' + end_string_suffix),
664 substitution_ref=re.compile(self.non_whitespace_escape_before
665 + r'(\|_{0,2})'
666 + end_string_suffix),
667 email=re.compile(self.email_pattern % args + '$',
668 re.VERBOSE),
669 uri=re.compile(
670 (r"""
671 %(start_string_prefix)s
672 (?P<whole>
673 (?P<absolute> # absolute URI
674 (?P<scheme> # scheme (http, ftp, mailto)
675 [a-zA-Z][a-zA-Z0-9.+-]*
676 )
677 :
678 (
679 ( # either:
680 (//?)? # hierarchical URI
681 %(uric)s* # URI characters
682 %(uri_end)s # final URI char
683 )
684 ( # optional query
685 \?%(uric)s*
686 %(uri_end)s
687 )?
688 ( # optional fragment
689 \#%(uric)s*
690 %(uri_end)s
691 )?
692 )
693 )
694 | # *OR*
695 (?P<email> # email address
696 """ + self.email_pattern + r"""
697 )
698 )
699 %(end_string_suffix)s
700 """) % args, re.VERBOSE),
701 pep=re.compile(
702 r"""
703 %(start_string_prefix)s
704 (
705 (pep-(?P<pepnum1>\d+)(.txt)?) # reference to source file
706 |
707 (PEP\s+(?P<pepnum2>\d+)) # reference by name
708 )
709 %(end_string_suffix)s""" % args, re.VERBOSE),
710 rfc=re.compile(
711 r"""
712 %(start_string_prefix)s
713 (RFC(-|\s+)?(?P<rfcnum>\d+))
714 %(end_string_suffix)s""" % args, re.VERBOSE))
715
716 self.implicit_dispatch.append((self.patterns.uri,
717 self.standalone_uri))
718 if settings.pep_references:
719 self.implicit_dispatch.append((self.patterns.pep,
720 self.pep_reference))
721 if settings.rfc_references:
722 self.implicit_dispatch.append((self.patterns.rfc,
723 self.rfc_reference))
724
725 def parse(self, text, lineno, memo, parent):
726 # Needs to be refactored for nested inline markup.
727 # Add nested_parse() method?
728 """
729 Return 2 lists: nodes (text and inline elements), and system_messages.
730
731 Using `self.patterns.initial`, a pattern which matches start-strings
732 (emphasis, strong, interpreted, phrase reference, literal,
733 substitution reference, and inline target) and complete constructs
734 (simple reference, footnote reference), search for a candidate. When
735 one is found, check for validity (e.g., not a quoted '*' character).
736 If valid, search for the corresponding end string if applicable, and
737 check it for validity. If not found or invalid, generate a warning
738 and ignore the start-string. Implicit inline markup (e.g. standalone
739 URIs) is found last.
740
741 :text: source string
742 :lineno: absolute line number, cf. `statemachine.get_source_and_line()`
743 """
744 self.document = memo.document
745 self.language = memo.language
746 self.reporter = self.document.reporter
747 self.parent = parent
748 pattern_search = self.patterns.initial.search
749 dispatch = self.dispatch
750 remaining = escape2null(text)
751 processed = []
752 unprocessed = []
753 messages = []
754 while remaining:
755 match = pattern_search(remaining)
756 if match:
757 groups = match.groupdict()
758 method = dispatch[groups['start'] or groups['backquote']
759 or groups['refend'] or groups['fnend']]
760 before, inlines, remaining, sysmessages = method(self, match,
761 lineno)
762 unprocessed.append(before)
763 messages += sysmessages
764 if inlines:
765 processed += self.implicit_inline(''.join(unprocessed),
766 lineno)
767 processed += inlines
768 unprocessed = []
769 else:
770 break
771 remaining = ''.join(unprocessed) + remaining
772 if remaining:
773 processed += self.implicit_inline(remaining, lineno)
774 return processed, messages
775
776 # Inline object recognition
777 # -------------------------
778 # See also init_customizations().
779 non_whitespace_before = r'(?<!\s)'
780 non_whitespace_escape_before = r'(?<![\s\x00])'
781 non_unescaped_whitespace_escape_before = r'(?<!(?<!\x00)[\s\x00])'
782 non_whitespace_after = r'(?!\s)'
783 # Alphanumerics with isolated internal [-._+:] chars (i.e. not 2 together):
784 simplename = r'(?:(?!_)\w)+(?:[-._+:](?:(?!_)\w)+)*'
785 # Valid URI characters (see RFC 2396 & RFC 2732);
786 # final \x00 allows backslash escapes in URIs:
787 uric = r"""[-_.!~*'()[\];/:@&=+$,%a-zA-Z0-9\x00]"""
788 # Delimiter indicating the end of a URI (not part of the URI):
789 uri_end_delim = r"""[>]"""
790 # Last URI character; same as uric but no punctuation:
791 urilast = r"""[_~*/=+a-zA-Z0-9]"""
792 # End of a URI (either 'urilast' or 'uric followed by a
793 # uri_end_delim'):
794 uri_end = r"""(?:%(urilast)s|%(uric)s(?=%(uri_end_delim)s))""" % locals()
795 emailc = r"""[-_!~*'{|}/#?^`&=+$%a-zA-Z0-9\x00]"""
796 email_pattern = r"""
797 %(emailc)s+(?:\.%(emailc)s+)* # name
798 (?<!\x00)@ # at
799 %(emailc)s+(?:\.%(emailc)s*)* # host
800 %(uri_end)s # final URI char
801 """
802
803 def quoted_start(self, match):
804 """Test if inline markup start-string is 'quoted'.
805
806 'Quoted' in this context means the start-string is enclosed in a pair
807 of matching opening/closing delimiters (not necessarily quotes)
808 or at the end of the match.
809 """
810 string = match.string
811 start = match.start()
812 if start == 0: # start-string at beginning of text
813 return False
814 prestart = string[start - 1]
815 try:
816 poststart = string[match.end()]
817 except IndexError: # start-string at end of text
818 return True # not "quoted" but no markup start-string either
819 return punctuation_chars.match_chars(prestart, poststart)
820
821 def inline_obj(self, match, lineno, end_pattern, nodeclass,
822 restore_backslashes=False):
823 string = match.string
824 matchstart = match.start('start')
825 matchend = match.end('start')
826 if self.quoted_start(match):
827 return string[:matchend], [], string[matchend:], [], ''
828 endmatch = end_pattern.search(string[matchend:])
829 if endmatch and endmatch.start(1): # 1 or more chars
830 text = endmatch.string[:endmatch.start(1)]
831 if restore_backslashes:
832 text = unescape(text, True)
833 textend = matchend + endmatch.end(1)
834 rawsource = unescape(string[matchstart:textend], True)
835 node = nodeclass(rawsource, text)
836 return (string[:matchstart], [node],
837 string[textend:], [], endmatch.group(1))
838 msg = self.reporter.warning(
839 'Inline %s start-string without end-string.'
840 % nodeclass.__name__, line=lineno)
841 text = unescape(string[matchstart:matchend], True)
842 prb = self.problematic(text, text, msg)
843 return string[:matchstart], [prb], string[matchend:], [msg], ''
844
845 def problematic(self, text, rawsource, message):
846 msgid = self.document.set_id(message, self.parent)
847 problematic = nodes.problematic(rawsource, text, refid=msgid)
848 prbid = self.document.set_id(problematic)
849 message.add_backref(prbid)
850 return problematic
851
852 def emphasis(self, match, lineno):
853 before, inlines, remaining, sysmessages, endstring = self.inline_obj(
854 match, lineno, self.patterns.emphasis, nodes.emphasis)
855 return before, inlines, remaining, sysmessages
856
857 def strong(self, match, lineno):
858 before, inlines, remaining, sysmessages, endstring = self.inline_obj(
859 match, lineno, self.patterns.strong, nodes.strong)
860 return before, inlines, remaining, sysmessages
861
862 def interpreted_or_phrase_ref(self, match, lineno):
863 end_pattern = self.patterns.interpreted_or_phrase_ref
864 string = match.string
865 matchstart = match.start('backquote')
866 matchend = match.end('backquote')
867 rolestart = match.start('role')
868 role = match.group('role')
869 position = ''
870 if role:
871 role = role[1:-1]
872 position = 'prefix'
873 elif self.quoted_start(match):
874 return string[:matchend], [], string[matchend:], []
875 endmatch = end_pattern.search(string[matchend:])
876 if endmatch and endmatch.start(1): # 1 or more chars
877 textend = matchend + endmatch.end()
878 if endmatch.group('role'):
879 if role:
880 msg = self.reporter.warning(
881 'Multiple roles in interpreted text (both '
882 'prefix and suffix present; only one allowed).',
883 line=lineno)
884 text = unescape(string[rolestart:textend], True)
885 prb = self.problematic(text, text, msg)
886 return string[:rolestart], [prb], string[textend:], [msg]
887 role = endmatch.group('suffix')[1:-1]
888 position = 'suffix'
889 escaped = endmatch.string[:endmatch.start(1)]
890 rawsource = unescape(string[matchstart:textend], True)
891 if rawsource[-1:] == '_':
892 if role:
893 msg = self.reporter.warning(
894 'Mismatch: both interpreted text role %s and '
895 'reference suffix.' % position, line=lineno)
896 text = unescape(string[rolestart:textend], True)
897 prb = self.problematic(text, text, msg)
898 return string[:rolestart], [prb], string[textend:], [msg]
899 return self.phrase_ref(string[:matchstart], string[textend:],
900 rawsource, escaped)
901 else:
902 rawsource = unescape(string[rolestart:textend], True)
903 nodelist, messages = self.interpreted(rawsource, escaped, role,
904 lineno)
905 return (string[:rolestart], nodelist,
906 string[textend:], messages)
907 msg = self.reporter.warning(
908 'Inline interpreted text or phrase reference start-string '
909 'without end-string.', line=lineno)
910 text = unescape(string[matchstart:matchend], True)
911 prb = self.problematic(text, text, msg)
912 return string[:matchstart], [prb], string[matchend:], [msg]
913
914 def phrase_ref(self, before, after, rawsource, escaped, text=None):
915 # `text` is ignored (since 0.16)
916 match = self.patterns.embedded_link.search(escaped)
917 if match: # embedded <URI> or <alias_>
918 text = escaped[:match.start(0)]
919 unescaped = unescape(text)
920 rawtext = unescape(text, True)
921 aliastext = match.group(2)
922 rawaliastext = unescape(aliastext, True)
923 underscore_escaped = rawaliastext.endswith(r'\_')
924 if (aliastext.endswith('_')
925 and not (underscore_escaped
926 or self.patterns.uri.match(aliastext))):
927 aliastype = 'name'
928 alias = normalize_name(unescape(aliastext[:-1]))
929 target = nodes.target(match.group(1), refname=alias)
930 else:
931 aliastype = 'uri'
932 # remove unescaped whitespace
933 alias_parts = split_escaped_whitespace(match.group(2))
934 alias = ' '.join(''.join(part.split())
935 for part in alias_parts)
936 alias = self.adjust_uri(unescape(alias))
937 if alias.endswith(r'\_'):
938 alias = alias[:-2] + '_'
939 target = nodes.target(match.group(1), refuri=alias)
940 target.referenced = 1
941 if not aliastext:
942 raise ApplicationError('problem with embedded link: %r'
943 % aliastext)
944 if not text:
945 text = alias
946 unescaped = unescape(text)
947 rawtext = rawaliastext
948 else:
949 text = escaped
950 unescaped = unescape(text)
951 target = None
952 rawtext = unescape(escaped, True)
953
954 refname = normalize_name(unescaped)
955 reference = nodes.reference(rawsource, text)
956 reference[0].rawsource = rawtext
957
958 node_list = [reference]
959
960 if rawsource[-2:] == '__':
961 if target and (aliastype == 'name'):
962 reference['refname'] = alias
963 self.document.note_refname(reference)
964 # self.document.note_indirect_target(target) # required?
965 elif target and (aliastype == 'uri'):
966 reference['refuri'] = alias
967 else:
968 reference['anonymous'] = True
969 else:
970 if target:
971 target['names'].append(refname)
972 if aliastype == 'name':
973 reference['refname'] = alias
974 self.document.note_indirect_target(target)
975 self.document.note_refname(reference)
976 else:
977 reference['refuri'] = alias
978 # target.note_referenced_by(name=refname)
979 self.document.note_implicit_target(target, self.parent)
980 node_list.append(target)
981 else:
982 reference['refname'] = refname
983 self.document.note_refname(reference)
984 return before, node_list, after, []
985
986 def adjust_uri(self, uri):
987 match = self.patterns.email.match(uri)
988 if match:
989 return 'mailto:' + uri
990 else:
991 return uri
992
993 def interpreted(self, rawsource, text, role, lineno):
994 role_fn, messages = roles.role(role, self.language, lineno,
995 self.reporter)
996 if role_fn:
997 nodes, messages2 = role_fn(role, rawsource, text, lineno, self)
998 return nodes, messages + messages2
999 else:
1000 msg = self.reporter.error(
1001 'Unknown interpreted text role "%s".' % role,
1002 line=lineno)
1003 return ([self.problematic(rawsource, rawsource, msg)],
1004 messages + [msg])
1005
1006 def literal(self, match, lineno):
1007 before, inlines, remaining, sysmessages, endstring = self.inline_obj(
1008 match, lineno, self.patterns.literal, nodes.literal,
1009 restore_backslashes=True)
1010 return before, inlines, remaining, sysmessages
1011
1012 def inline_internal_target(self, match, lineno):
1013 before, inlines, remaining, sysmessages, endstring = self.inline_obj(
1014 match, lineno, self.patterns.target, nodes.target)
1015 if inlines and isinstance(inlines[0], nodes.target):
1016 assert len(inlines) == 1
1017 target = inlines[0]
1018 name = normalize_name(target.astext())
1019 target['names'].append(name)
1020 self.document.note_explicit_target(target, self.parent)
1021 return before, inlines, remaining, sysmessages
1022
1023 def substitution_reference(self, match, lineno):
1024 before, inlines, remaining, sysmessages, endstring = self.inline_obj(
1025 match, lineno, self.patterns.substitution_ref,
1026 nodes.substitution_reference)
1027 if len(inlines) == 1:
1028 subref_node = inlines[0]
1029 if isinstance(subref_node, nodes.substitution_reference):
1030 subref_text = subref_node.astext()
1031 self.document.note_substitution_ref(subref_node, subref_text)
1032 if endstring[-1:] == '_':
1033 reference_node = nodes.reference(
1034 '|%s%s' % (subref_text, endstring), '')
1035 if endstring[-2:] == '__':
1036 reference_node['anonymous'] = True
1037 else:
1038 reference_node['refname'] = normalize_name(subref_text)
1039 self.document.note_refname(reference_node)
1040 reference_node += subref_node
1041 inlines = [reference_node]
1042 return before, inlines, remaining, sysmessages
1043
1044 def footnote_reference(self, match, lineno):
1045 """
1046 Handles `nodes.footnote_reference` and `nodes.citation_reference`
1047 elements.
1048 """
1049 label = match.group('footnotelabel')
1050 refname = normalize_name(label)
1051 string = match.string
1052 before = string[:match.start('whole')]
1053 remaining = string[match.end('whole'):]
1054 if match.group('citationlabel'):
1055 refnode = nodes.citation_reference('[%s]_' % label,
1056 refname=refname)
1057 refnode += nodes.Text(label)
1058 self.document.note_citation_ref(refnode)
1059 else:
1060 refnode = nodes.footnote_reference('[%s]_' % label)
1061 if refname[0] == '#':
1062 refname = refname[1:]
1063 refnode['auto'] = 1
1064 self.document.note_autofootnote_ref(refnode)
1065 elif refname == '*':
1066 refname = ''
1067 refnode['auto'] = '*'
1068 self.document.note_symbol_footnote_ref(
1069 refnode)
1070 else:
1071 refnode += nodes.Text(label)
1072 if refname:
1073 refnode['refname'] = refname
1074 self.document.note_footnote_ref(refnode)
1075 if utils.get_trim_footnote_ref_space(self.document.settings):
1076 before = before.rstrip()
1077 return before, [refnode], remaining, []
1078
1079 def reference(self, match, lineno, anonymous=False):
1080 referencename = match.group('refname')
1081 refname = normalize_name(referencename)
1082 referencenode = nodes.reference(
1083 referencename + match.group('refend'), referencename)
1084 referencenode[0].rawsource = referencename
1085 if anonymous:
1086 referencenode['anonymous'] = True
1087 else:
1088 referencenode['refname'] = refname
1089 self.document.note_refname(referencenode)
1090 string = match.string
1091 matchstart = match.start('whole')
1092 matchend = match.end('whole')
1093 return string[:matchstart], [referencenode], string[matchend:], []
1094
1095 def anonymous_reference(self, match, lineno):
1096 return self.reference(match, lineno, anonymous=True)
1097
1098 def standalone_uri(self, match, lineno):
1099 if (not match.group('scheme')
1100 or match.group('scheme').lower() in urischemes.schemes):
1101 if match.group('email'):
1102 addscheme = 'mailto:'
1103 else:
1104 addscheme = ''
1105 text = match.group('whole')
1106 refuri = addscheme + unescape(text)
1107 reference = nodes.reference(unescape(text, True), text,
1108 refuri=refuri)
1109 return [reference]
1110 else: # not a valid scheme
1111 raise MarkupMismatch
1112
1113 def pep_reference(self, match, lineno):
1114 text = match.group(0)
1115 if text.startswith('pep-'):
1116 pepnum = int(unescape(match.group('pepnum1')))
1117 elif text.startswith('PEP'):
1118 pepnum = int(unescape(match.group('pepnum2')))
1119 else:
1120 raise MarkupMismatch
1121 ref = (self.document.settings.pep_base_url
1122 + self.document.settings.pep_file_url_template % pepnum)
1123 return [nodes.reference(unescape(text, True), text, refuri=ref)]
1124
1125 rfc_url = 'rfc%d.html'
1126
1127 def rfc_reference(self, match, lineno):
1128 text = match.group(0)
1129 if text.startswith('RFC'):
1130 rfcnum = int(unescape(match.group('rfcnum')))
1131 ref = self.document.settings.rfc_base_url + self.rfc_url % rfcnum
1132 else:
1133 raise MarkupMismatch
1134 return [nodes.reference(unescape(text, True), text, refuri=ref)]
1135
1136 def implicit_inline(self, text, lineno):
1137 """
1138 Check each of the patterns in `self.implicit_dispatch` for a match,
1139 and dispatch to the stored method for the pattern. Recursively check
1140 the text before and after the match. Return a list of `nodes.Text`
1141 and inline element nodes.
1142 """
1143 if not text:
1144 return []
1145 for pattern, method in self.implicit_dispatch:
1146 match = pattern.search(text)
1147 if match:
1148 try:
1149 # Must recurse on strings before *and* after the match;
1150 # there may be multiple patterns.
1151 return (self.implicit_inline(text[:match.start()], lineno)
1152 + method(match, lineno)
1153 + self.implicit_inline(text[match.end():], lineno))
1154 except MarkupMismatch:
1155 pass
1156 return [nodes.Text(text)]
1157
1158 dispatch = {'*': emphasis,
1159 '**': strong,
1160 '`': interpreted_or_phrase_ref,
1161 '``': literal,
1162 '_`': inline_internal_target,
1163 ']_': footnote_reference,
1164 '|': substitution_reference,
1165 '_': reference,
1166 '__': anonymous_reference}
1167
1168
1169def _loweralpha_to_int(s, _zero=(ord('a')-1)):
1170 return ord(s) - _zero
1171
1172
1173def _upperalpha_to_int(s, _zero=(ord('A')-1)):
1174 return ord(s) - _zero
1175
1176
1177class Body(RSTState):
1178
1179 """
1180 Generic classifier of the first line of a block.
1181 """
1182
1183 double_width_pad_char = tableparser.TableParser.double_width_pad_char
1184 """Padding character for East Asian double-width text."""
1185
1186 enum = Struct()
1187 """Enumerated list parsing information."""
1188
1189 enum.formatinfo = {
1190 'parens': Struct(prefix='(', suffix=')', start=1, end=-1),
1191 'rparen': Struct(prefix='', suffix=')', start=0, end=-1),
1192 'period': Struct(prefix='', suffix='.', start=0, end=-1)}
1193 enum.formats = enum.formatinfo.keys()
1194 enum.sequences = ['arabic', 'loweralpha', 'upperalpha',
1195 'lowerroman', 'upperroman'] # ORDERED!
1196 enum.sequencepats = {'arabic': '[0-9]+',
1197 'loweralpha': '[a-z]',
1198 'upperalpha': '[A-Z]',
1199 'lowerroman': '[ivxlcdm]+',
1200 'upperroman': '[IVXLCDM]+'}
1201 enum.converters = {'arabic': int,
1202 'loweralpha': _loweralpha_to_int,
1203 'upperalpha': _upperalpha_to_int,
1204 'lowerroman': RomanNumeral.from_string,
1205 'upperroman': RomanNumeral.from_string}
1206
1207 enum.sequenceregexps = {}
1208 for sequence in enum.sequences:
1209 enum.sequenceregexps[sequence] = re.compile(
1210 enum.sequencepats[sequence] + '$')
1211
1212 grid_table_top_pat = re.compile(r'\+-[-+]+-\+ *$')
1213 """Matches the top (& bottom) of a full table)."""
1214
1215 simple_table_top_pat = re.compile('=+( +=+)+ *$')
1216 """Matches the top of a simple table."""
1217
1218 simple_table_border_pat = re.compile('=+[ =]*$')
1219 """Matches the bottom & header bottom of a simple table."""
1220
1221 pats = {}
1222 """Fragments of patterns used by transitions."""
1223
1224 pats['nonalphanum7bit'] = '[!-/:-@[-`{-~]'
1225 pats['alpha'] = '[a-zA-Z]'
1226 pats['alphanum'] = '[a-zA-Z0-9]'
1227 pats['alphanumplus'] = '[a-zA-Z0-9_-]'
1228 pats['enum'] = ('(%(arabic)s|%(loweralpha)s|%(upperalpha)s|%(lowerroman)s'
1229 '|%(upperroman)s|#)' % enum.sequencepats)
1230 pats['optname'] = '%(alphanum)s%(alphanumplus)s*' % pats
1231 # @@@ Loosen up the pattern? Allow Unicode?
1232 pats['optarg'] = '(%(alpha)s%(alphanumplus)s*|<[^<>]+>)' % pats
1233 pats['shortopt'] = r'(-|\+)%(alphanum)s( ?%(optarg)s)?' % pats
1234 pats['longopt'] = r'(--|/)%(optname)s([ =]%(optarg)s)?' % pats
1235 pats['option'] = r'(%(shortopt)s|%(longopt)s)' % pats
1236
1237 for format in enum.formats:
1238 pats[format] = '(?P<%s>%s%s%s)' % (
1239 format, re.escape(enum.formatinfo[format].prefix),
1240 pats['enum'], re.escape(enum.formatinfo[format].suffix))
1241
1242 patterns = {
1243 'bullet': '[-+*\u2022\u2023\u2043]( +|$)',
1244 'enumerator': r'(%(parens)s|%(rparen)s|%(period)s)( +|$)' % pats,
1245 'field_marker': r':(?![: ])([^:\\]|\\.|:(?!([ `]|$)))*(?<! ):( +|$)',
1246 'option_marker': r'%(option)s(, %(option)s)*( +| ?$)' % pats,
1247 'doctest': r'>>>( +|$)',
1248 'line_block': r'\|( +|$)',
1249 'grid_table_top': grid_table_top_pat,
1250 'simple_table_top': simple_table_top_pat,
1251 'explicit_markup': r'\.\.( +|$)',
1252 'anonymous': r'__( +|$)',
1253 'line': r'(%(nonalphanum7bit)s)\1* *$' % pats,
1254 'text': r''}
1255 initial_transitions = (
1256 'bullet',
1257 'enumerator',
1258 'field_marker',
1259 'option_marker',
1260 'doctest',
1261 'line_block',
1262 'grid_table_top',
1263 'simple_table_top',
1264 'explicit_markup',
1265 'anonymous',
1266 'line',
1267 'text')
1268
1269 def indent(self, match, context, next_state):
1270 """Block quote."""
1271 (indented, indent, line_offset, blank_finish
1272 ) = self.state_machine.get_indented()
1273 elements = self.block_quote(indented, line_offset)
1274 self.parent += elements
1275 if not blank_finish:
1276 self.parent += self.unindent_warning('Block quote')
1277 return context, next_state, []
1278
1279 def block_quote(self, indented, line_offset):
1280 elements = []
1281 while indented:
1282 blockquote = nodes.block_quote(rawsource='\n'.join(indented))
1283 (blockquote.source, blockquote.line
1284 ) = self.state_machine.get_source_and_line(line_offset+1)
1285 (blockquote_lines,
1286 attribution_lines,
1287 attribution_offset,
1288 indented,
1289 new_line_offset) = self.split_attribution(indented, line_offset)
1290 self.nested_parse(blockquote_lines, line_offset, blockquote)
1291 elements.append(blockquote)
1292 if attribution_lines:
1293 attribution, messages = self.parse_attribution(
1294 attribution_lines, line_offset+attribution_offset)
1295 blockquote += attribution
1296 elements += messages
1297 line_offset = new_line_offset
1298 while indented and not indented[0]:
1299 indented = indented[1:]
1300 line_offset += 1
1301 return elements
1302
1303 # U+2014 is an em-dash:
1304 attribution_pattern = re.compile('(---?(?!-)|\u2014) *(?=[^ \\n])')
1305
1306 def split_attribution(self, indented, line_offset):
1307 """
1308 Check for a block quote attribution and split it off:
1309
1310 * First line after a blank line must begin with a dash ("--", "---",
1311 em-dash; matches `self.attribution_pattern`).
1312 * Every line after that must have consistent indentation.
1313 * Attributions must be preceded by block quote content.
1314
1315 Return a tuple of: (block quote content lines, attribution lines,
1316 attribution offset, remaining indented lines, remaining lines offset).
1317 """
1318 blank = None
1319 nonblank_seen = False
1320 for i in range(len(indented)):
1321 line = indented[i].rstrip()
1322 if line:
1323 if nonblank_seen and blank == i - 1: # last line blank
1324 match = self.attribution_pattern.match(line)
1325 if match:
1326 attribution_end, indent = self.check_attribution(
1327 indented, i)
1328 if attribution_end:
1329 a_lines = indented[i:attribution_end]
1330 a_lines.trim_left(match.end(), end=1)
1331 a_lines.trim_left(indent, start=1)
1332 return (indented[:i], a_lines,
1333 i, indented[attribution_end:],
1334 line_offset + attribution_end)
1335 nonblank_seen = True
1336 else:
1337 blank = i
1338 else:
1339 return indented, None, None, None, None
1340
1341 def check_attribution(self, indented, attribution_start):
1342 """
1343 Check attribution shape.
1344 Return the index past the end of the attribution, and the indent.
1345 """
1346 indent = None
1347 i = attribution_start + 1
1348 for i in range(attribution_start + 1, len(indented)):
1349 line = indented[i].rstrip()
1350 if not line:
1351 break
1352 if indent is None:
1353 indent = len(line) - len(line.lstrip())
1354 elif len(line) - len(line.lstrip()) != indent:
1355 return None, None # bad shape; not an attribution
1356 else:
1357 # return index of line after last attribution line:
1358 i += 1
1359 return i, (indent or 0)
1360
1361 def parse_attribution(self, indented, line_offset):
1362 text = '\n'.join(indented).rstrip()
1363 lineno = 1 + line_offset # line_offset is zero-based
1364 textnodes, messages = self.inline_text(text, lineno)
1365 node = nodes.attribution(text, '', *textnodes)
1366 node.source, node.line = self.state_machine.get_source_and_line(lineno)
1367 return node, messages
1368
1369 def bullet(self, match, context, next_state):
1370 """Bullet list item."""
1371 ul = nodes.bullet_list()
1372 ul.source, ul.line = self.state_machine.get_source_and_line()
1373 self.parent += ul
1374 ul['bullet'] = match.string[0]
1375 i, blank_finish = self.list_item(match.end())
1376 ul += i
1377 offset = self.state_machine.line_offset + 1 # next line
1378 new_line_offset, blank_finish = self.nested_list_parse(
1379 self.state_machine.input_lines[offset:],
1380 input_offset=self.state_machine.abs_line_offset() + 1,
1381 node=ul, initial_state='BulletList',
1382 blank_finish=blank_finish)
1383 self.goto_line(new_line_offset)
1384 if not blank_finish:
1385 self.parent += self.unindent_warning('Bullet list')
1386 return [], next_state, []
1387
1388 def list_item(self, indent):
1389 src, srcline = self.state_machine.get_source_and_line()
1390 if self.state_machine.line[indent:]:
1391 indented, line_offset, blank_finish = (
1392 self.state_machine.get_known_indented(indent))
1393 else:
1394 indented, indent, line_offset, blank_finish = (
1395 self.state_machine.get_first_known_indented(indent))
1396 listitem = nodes.list_item('\n'.join(indented))
1397 listitem.source, listitem.line = src, srcline
1398 if indented:
1399 self.nested_parse(indented, input_offset=line_offset,
1400 node=listitem)
1401 return listitem, blank_finish
1402
1403 def enumerator(self, match, context, next_state):
1404 """Enumerated List Item"""
1405 format, sequence, text, ordinal = self.parse_enumerator(match)
1406 if not self.is_enumerated_list_item(ordinal, sequence, format):
1407 raise statemachine.TransitionCorrection('text')
1408 enumlist = nodes.enumerated_list()
1409 (enumlist.source,
1410 enumlist.line) = self.state_machine.get_source_and_line()
1411 self.parent += enumlist
1412 if sequence == '#':
1413 enumlist['enumtype'] = 'arabic'
1414 else:
1415 enumlist['enumtype'] = sequence
1416 enumlist['prefix'] = self.enum.formatinfo[format].prefix
1417 enumlist['suffix'] = self.enum.formatinfo[format].suffix
1418 if ordinal != 1:
1419 enumlist['start'] = ordinal
1420 msg = self.reporter.info(
1421 'Enumerated list start value not ordinal-1: "%s" (ordinal %s)'
1422 % (text, ordinal), base_node=enumlist)
1423 self.parent += msg
1424 listitem, blank_finish = self.list_item(match.end())
1425 enumlist += listitem
1426 offset = self.state_machine.line_offset + 1 # next line
1427 newline_offset, blank_finish = self.nested_list_parse(
1428 self.state_machine.input_lines[offset:],
1429 input_offset=self.state_machine.abs_line_offset() + 1,
1430 node=enumlist, initial_state='EnumeratedList',
1431 blank_finish=blank_finish,
1432 extra_settings={'lastordinal': ordinal,
1433 'format': format,
1434 'auto': sequence == '#'})
1435 self.goto_line(newline_offset)
1436 if not blank_finish:
1437 self.parent += self.unindent_warning('Enumerated list')
1438 return [], next_state, []
1439
1440 def parse_enumerator(self, match, expected_sequence=None):
1441 """
1442 Analyze an enumerator and return the results.
1443
1444 :Return:
1445 - the enumerator format ('period', 'parens', or 'rparen'),
1446 - the sequence used ('arabic', 'loweralpha', 'upperroman', etc.),
1447 - the text of the enumerator, stripped of formatting, and
1448 - the ordinal value of the enumerator ('a' -> 1, 'ii' -> 2, etc.;
1449 ``None`` is returned for invalid enumerator text).
1450
1451 The enumerator format has already been determined by the regular
1452 expression match. If `expected_sequence` is given, that sequence is
1453 tried first. If not, we check for Roman numeral 1. This way,
1454 single-character Roman numerals (which are also alphabetical) can be
1455 matched. If no sequence has been matched, all sequences are checked in
1456 order.
1457 """
1458 groupdict = match.groupdict()
1459 sequence = ''
1460 for format in self.enum.formats:
1461 if groupdict[format]: # was this the format matched?
1462 break # yes; keep `format`
1463 else: # shouldn't happen
1464 raise ParserError('enumerator format not matched')
1465 text = groupdict[format][self.enum.formatinfo[format].start # noqa: E203,E501
1466 : self.enum.formatinfo[format].end]
1467 if text == '#':
1468 sequence = '#'
1469 elif expected_sequence:
1470 try:
1471 if self.enum.sequenceregexps[expected_sequence].match(text):
1472 sequence = expected_sequence
1473 except KeyError: # shouldn't happen
1474 raise ParserError('unknown enumerator sequence: %s'
1475 % sequence)
1476 elif text == 'i':
1477 sequence = 'lowerroman'
1478 elif text == 'I':
1479 sequence = 'upperroman'
1480 if not sequence:
1481 for sequence in self.enum.sequences:
1482 if self.enum.sequenceregexps[sequence].match(text):
1483 break
1484 else: # shouldn't happen
1485 raise ParserError('enumerator sequence not matched')
1486 if sequence == '#':
1487 ordinal = 1
1488 else:
1489 try:
1490 ordinal = int(self.enum.converters[sequence](text))
1491 except InvalidRomanNumeralError:
1492 ordinal = None
1493 return format, sequence, text, ordinal
1494
1495 def is_enumerated_list_item(self, ordinal, sequence, format):
1496 """
1497 Check validity based on the ordinal value and the second line.
1498
1499 Return true if the ordinal is valid and the second line is blank,
1500 indented, or starts with the next enumerator or an auto-enumerator.
1501 """
1502 if ordinal is None:
1503 return None
1504 try:
1505 next_line = self.state_machine.next_line()
1506 except EOFError: # end of input lines
1507 self.state_machine.previous_line()
1508 return 1
1509 else:
1510 self.state_machine.previous_line()
1511 if not next_line[:1].strip(): # blank or indented
1512 return 1
1513 result = self.make_enumerator(ordinal + 1, sequence, format)
1514 if result:
1515 next_enumerator, auto_enumerator = result
1516 try:
1517 if next_line.startswith((next_enumerator, auto_enumerator)):
1518 return 1
1519 except TypeError:
1520 pass
1521 return None
1522
1523 def make_enumerator(self, ordinal, sequence, format):
1524 """
1525 Construct and return the next enumerated list item marker, and an
1526 auto-enumerator ("#" instead of the regular enumerator).
1527
1528 Return ``None`` for invalid (out of range) ordinals.
1529 """
1530 if sequence == '#':
1531 enumerator = '#'
1532 elif sequence == 'arabic':
1533 enumerator = str(ordinal)
1534 else:
1535 if sequence.endswith('alpha'):
1536 if ordinal > 26:
1537 return None
1538 enumerator = chr(ordinal + ord('a') - 1)
1539 elif sequence.endswith('roman'):
1540 try:
1541 enumerator = RomanNumeral(ordinal).to_uppercase()
1542 except TypeError:
1543 return None
1544 else: # shouldn't happen
1545 raise ParserError('unknown enumerator sequence: "%s"'
1546 % sequence)
1547 if sequence.startswith('lower'):
1548 enumerator = enumerator.lower()
1549 elif sequence.startswith('upper'):
1550 enumerator = enumerator.upper()
1551 else: # shouldn't happen
1552 raise ParserError('unknown enumerator sequence: "%s"'
1553 % sequence)
1554 formatinfo = self.enum.formatinfo[format]
1555 next_enumerator = (formatinfo.prefix + enumerator + formatinfo.suffix
1556 + ' ')
1557 auto_enumerator = formatinfo.prefix + '#' + formatinfo.suffix + ' '
1558 return next_enumerator, auto_enumerator
1559
1560 def field_marker(self, match, context, next_state):
1561 """Field list item."""
1562 field_list = nodes.field_list()
1563 self.parent += field_list
1564 field, blank_finish = self.field(match)
1565 field_list += field
1566 offset = self.state_machine.line_offset + 1 # next line
1567 newline_offset, blank_finish = self.nested_list_parse(
1568 self.state_machine.input_lines[offset:],
1569 input_offset=self.state_machine.abs_line_offset() + 1,
1570 node=field_list, initial_state='FieldList',
1571 blank_finish=blank_finish)
1572 self.goto_line(newline_offset)
1573 if not blank_finish:
1574 self.parent += self.unindent_warning('Field list')
1575 return [], next_state, []
1576
1577 def field(self, match):
1578 name = self.parse_field_marker(match)
1579 src, srcline = self.state_machine.get_source_and_line()
1580 lineno = self.state_machine.abs_line_number()
1581 (indented, indent, line_offset, blank_finish
1582 ) = self.state_machine.get_first_known_indented(match.end())
1583 field_node = nodes.field()
1584 field_node.source = src
1585 field_node.line = srcline
1586 name_nodes, name_messages = self.inline_text(name, lineno)
1587 field_node += nodes.field_name(name, '', *name_nodes)
1588 field_body = nodes.field_body('\n'.join(indented), *name_messages)
1589 field_node += field_body
1590 if indented:
1591 self.parse_field_body(indented, line_offset, field_body)
1592 return field_node, blank_finish
1593
1594 def parse_field_marker(self, match):
1595 """Extract & return field name from a field marker match."""
1596 field = match.group()[1:] # strip off leading ':'
1597 field = field[:field.rfind(':')] # strip off trailing ':' etc.
1598 return field
1599
1600 def parse_field_body(self, indented, offset, node) -> None:
1601 self.nested_parse(indented, input_offset=offset, node=node)
1602
1603 def option_marker(self, match, context, next_state):
1604 """Option list item."""
1605 optionlist = nodes.option_list()
1606 (optionlist.source, optionlist.line
1607 ) = self.state_machine.get_source_and_line()
1608 try:
1609 listitem, blank_finish = self.option_list_item(match)
1610 except MarkupError as error:
1611 # This shouldn't happen; pattern won't match.
1612 msg = self.reporter.error('Invalid option list marker: %s'
1613 % error)
1614 self.parent += msg
1615 (indented, indent, line_offset, blank_finish
1616 ) = self.state_machine.get_first_known_indented(match.end())
1617 elements = self.block_quote(indented, line_offset)
1618 self.parent += elements
1619 if not blank_finish:
1620 self.parent += self.unindent_warning('Option list')
1621 return [], next_state, []
1622 self.parent += optionlist
1623 optionlist += listitem
1624 offset = self.state_machine.line_offset + 1 # next line
1625 newline_offset, blank_finish = self.nested_list_parse(
1626 self.state_machine.input_lines[offset:],
1627 input_offset=self.state_machine.abs_line_offset() + 1,
1628 node=optionlist, initial_state='OptionList',
1629 blank_finish=blank_finish)
1630 self.goto_line(newline_offset)
1631 if not blank_finish:
1632 self.parent += self.unindent_warning('Option list')
1633 return [], next_state, []
1634
1635 def option_list_item(self, match):
1636 offset = self.state_machine.abs_line_offset()
1637 options = self.parse_option_marker(match)
1638 (indented, indent, line_offset, blank_finish
1639 ) = self.state_machine.get_first_known_indented(match.end())
1640 if not indented: # not an option list item
1641 self.goto_line(offset)
1642 raise statemachine.TransitionCorrection('text')
1643 option_group = nodes.option_group('', *options)
1644 description = nodes.description('\n'.join(indented))
1645 option_list_item = nodes.option_list_item('', option_group,
1646 description)
1647 if indented:
1648 self.nested_parse(indented, input_offset=line_offset,
1649 node=description)
1650 return option_list_item, blank_finish
1651
1652 def parse_option_marker(self, match):
1653 """
1654 Return a list of `node.option` and `node.option_argument` objects,
1655 parsed from an option marker match.
1656
1657 :Exception: `MarkupError` for invalid option markers.
1658 """
1659 optlist = []
1660 # split at ", ", except inside < > (complex arguments)
1661 optionstrings = re.split(r', (?![^<]*>)', match.group().rstrip())
1662 for optionstring in optionstrings:
1663 tokens = optionstring.split()
1664 delimiter = ' '
1665 firstopt = tokens[0].split('=', 1)
1666 if len(firstopt) > 1:
1667 # "--opt=value" form
1668 tokens[:1] = firstopt
1669 delimiter = '='
1670 elif (len(tokens[0]) > 2
1671 and ((tokens[0].startswith('-')
1672 and not tokens[0].startswith('--'))
1673 or tokens[0].startswith('+'))):
1674 # "-ovalue" form
1675 tokens[:1] = [tokens[0][:2], tokens[0][2:]]
1676 delimiter = ''
1677 if len(tokens) > 1 and (tokens[1].startswith('<')
1678 and tokens[-1].endswith('>')):
1679 # "-o <value1 value2>" form; join all values into one token
1680 tokens[1:] = [' '.join(tokens[1:])]
1681 if 0 < len(tokens) <= 2:
1682 option = nodes.option(optionstring)
1683 option += nodes.option_string(tokens[0], tokens[0])
1684 if len(tokens) > 1:
1685 option += nodes.option_argument(tokens[1], tokens[1],
1686 delimiter=delimiter)
1687 optlist.append(option)
1688 else:
1689 raise MarkupError(
1690 'wrong number of option tokens (=%s), should be 1 or 2: '
1691 '"%s"' % (len(tokens), optionstring))
1692 return optlist
1693
1694 def doctest(self, match, context, next_state):
1695 line = self.document.current_line
1696 data = '\n'.join(self.state_machine.get_text_block())
1697 # TODO: Parse with `directives.body.CodeBlock` with
1698 # argument 'pycon' (Python Console) in Docutils 1.0.
1699 n = nodes.doctest_block(data, data)
1700 n.line = line
1701 self.parent += n
1702 return [], next_state, []
1703
1704 def line_block(self, match, context, next_state):
1705 """First line of a line block."""
1706 block = nodes.line_block()
1707 self.parent += block
1708 lineno = self.state_machine.abs_line_number()
1709 (block.source,
1710 block.line) = self.state_machine.get_source_and_line(lineno)
1711 line, messages, blank_finish = self.line_block_line(match, lineno)
1712 block += line
1713 self.parent += messages
1714 if not blank_finish:
1715 offset = self.state_machine.line_offset + 1 # next line
1716 new_line_offset, blank_finish = self.nested_list_parse(
1717 self.state_machine.input_lines[offset:],
1718 input_offset=self.state_machine.abs_line_offset() + 1,
1719 node=block, initial_state='LineBlock',
1720 blank_finish=False)
1721 self.goto_line(new_line_offset)
1722 if not blank_finish:
1723 self.parent += self.reporter.warning(
1724 'Line block ends without a blank line.',
1725 line=lineno+1)
1726 if len(block):
1727 if block[0].indent is None:
1728 block[0].indent = 0
1729 self.nest_line_block_lines(block)
1730 return [], next_state, []
1731
1732 def line_block_line(self, match, lineno):
1733 """Return one line element of a line_block."""
1734 (indented, indent, line_offset, blank_finish
1735 ) = self.state_machine.get_first_known_indented(match.end(),
1736 until_blank=True)
1737 text = '\n'.join(indented)
1738 text_nodes, messages = self.inline_text(text, lineno)
1739 line = nodes.line(text, '', *text_nodes)
1740 (line.source,
1741 line.line) = self.state_machine.get_source_and_line(lineno)
1742 if match.string.rstrip() != '|': # not empty
1743 line.indent = len(match.group(1)) - 1
1744 return line, messages, blank_finish
1745
1746 def nest_line_block_lines(self, block) -> None:
1747 for index in range(1, len(block)):
1748 if block[index].indent is None:
1749 block[index].indent = block[index - 1].indent
1750 self.nest_line_block_segment(block)
1751
1752 def nest_line_block_segment(self, block) -> None:
1753 indents = [item.indent for item in block]
1754 least = min(indents)
1755 new_items = []
1756 new_block = nodes.line_block()
1757 for item in block:
1758 if item.indent > least:
1759 new_block.append(item)
1760 else:
1761 if len(new_block):
1762 self.nest_line_block_segment(new_block)
1763 new_items.append(new_block)
1764 new_block = nodes.line_block()
1765 new_items.append(item)
1766 if len(new_block):
1767 self.nest_line_block_segment(new_block)
1768 new_items.append(new_block)
1769 block[:] = new_items
1770
1771 def grid_table_top(self, match, context, next_state):
1772 """Top border of a full table."""
1773 return self.table_top(match, context, next_state,
1774 self.isolate_grid_table,
1775 tableparser.GridTableParser)
1776
1777 def simple_table_top(self, match, context, next_state):
1778 """Top border of a simple table."""
1779 return self.table_top(match, context, next_state,
1780 self.isolate_simple_table,
1781 tableparser.SimpleTableParser)
1782
1783 def table_top(self, match, context, next_state,
1784 isolate_function, parser_class):
1785 """Top border of a generic table."""
1786 nodelist, blank_finish = self.table(isolate_function, parser_class)
1787 self.parent += nodelist
1788 if not blank_finish:
1789 msg = self.reporter.warning(
1790 'Blank line required after table.',
1791 line=self.state_machine.abs_line_number()+1)
1792 self.parent += msg
1793 return [], next_state, []
1794
1795 def table(self, isolate_function, parser_class):
1796 """Parse a table."""
1797 block, messages, blank_finish = isolate_function()
1798 if block:
1799 try:
1800 parser = parser_class()
1801 tabledata = parser.parse(block)
1802 tableline = (self.state_machine.abs_line_number() - len(block)
1803 + 1)
1804 table = self.build_table(tabledata, tableline)
1805 nodelist = [table] + messages
1806 except tableparser.TableMarkupError as err:
1807 nodelist = self.malformed_table(block, ' '.join(err.args),
1808 offset=err.offset) + messages
1809 else:
1810 nodelist = messages
1811 return nodelist, blank_finish
1812
1813 def isolate_grid_table(self):
1814 messages = []
1815 blank_finish = True
1816 try:
1817 block = self.state_machine.get_text_block(flush_left=True)
1818 except statemachine.UnexpectedIndentationError as err:
1819 block, src, srcline = err.args
1820 messages.append(self.reporter.error('Unexpected indentation.',
1821 source=src, line=srcline))
1822 blank_finish = False
1823 block.disconnect()
1824 # for East Asian chars:
1825 block.pad_double_width(self.double_width_pad_char)
1826 width = len(block[0].strip())
1827 for i in range(len(block)):
1828 block[i] = block[i].strip()
1829 if block[i][0] not in '+|': # check left edge
1830 blank_finish = False
1831 self.state_machine.previous_line(len(block) - i)
1832 del block[i:]
1833 break
1834 if not self.grid_table_top_pat.match(block[-1]): # find bottom
1835 # from second-last to third line of table:
1836 for i in range(len(block) - 2, 1, -1):
1837 if self.grid_table_top_pat.match(block[i]):
1838 self.state_machine.previous_line(len(block) - i + 1)
1839 del block[i+1:]
1840 blank_finish = False
1841 break
1842 else:
1843 detail = 'Bottom border missing or corrupt.'
1844 messages.extend(self.malformed_table(block, detail, i))
1845 return [], messages, blank_finish
1846 for i in range(len(block)): # check right edge
1847 if len(strip_combining_chars(block[i])
1848 ) != width or block[i][-1] not in '+|':
1849 detail = 'Right border not aligned or missing.'
1850 messages.extend(self.malformed_table(block, detail, i))
1851 return [], messages, blank_finish
1852 return block, messages, blank_finish
1853
1854 def isolate_simple_table(self):
1855 start = self.state_machine.line_offset
1856 lines = self.state_machine.input_lines
1857 limit = len(lines) - 1
1858 toplen = len(lines[start].strip())
1859 pattern_match = self.simple_table_border_pat.match
1860 found = 0
1861 found_at = None
1862 i = start + 1
1863 while i <= limit:
1864 line = lines[i]
1865 match = pattern_match(line)
1866 if match:
1867 if len(line.strip()) != toplen:
1868 self.state_machine.next_line(i - start)
1869 messages = self.malformed_table(
1870 lines[start:i+1], 'Bottom border or header rule does '
1871 'not match top border.', i-start)
1872 return [], messages, i == limit or not lines[i+1].strip()
1873 found += 1
1874 found_at = i
1875 if found == 2 or i == limit or not lines[i+1].strip():
1876 end = i
1877 break
1878 i += 1
1879 else: # reached end of input_lines
1880 details = 'No bottom table border found'
1881 if found:
1882 details += ' or no blank line after table bottom'
1883 self.state_machine.next_line(found_at - start)
1884 block = lines[start:found_at+1]
1885 else:
1886 self.state_machine.next_line(i - start - 1)
1887 block = lines[start:]
1888 messages = self.malformed_table(block, details + '.')
1889 return [], messages, not found
1890 self.state_machine.next_line(end - start)
1891 block = lines[start:end+1]
1892 # for East Asian chars:
1893 block.pad_double_width(self.double_width_pad_char)
1894 return block, [], end == limit or not lines[end+1].strip()
1895
1896 def malformed_table(self, block, detail='', offset=0):
1897 block.replace(self.double_width_pad_char, '')
1898 data = '\n'.join(block)
1899 message = 'Malformed table.'
1900 startline = self.state_machine.abs_line_number() - len(block) + 1
1901 if detail:
1902 message += '\n' + detail
1903 error = self.reporter.error(message, nodes.literal_block(data, data),
1904 line=startline+offset)
1905 return [error]
1906
1907 def build_table(self, tabledata, tableline, stub_columns=0, widths=None):
1908 colwidths, headrows, bodyrows = tabledata
1909 table = nodes.table()
1910 (table.source,
1911 table.line) = self.state_machine.get_source_and_line(tableline)
1912 if widths == 'auto':
1913 table['classes'] += ['colwidths-auto']
1914 elif widths: # "grid" or list of integers
1915 table['classes'] += ['colwidths-given']
1916 tgroup = nodes.tgroup(cols=len(colwidths))
1917 table += tgroup
1918 for colwidth in colwidths:
1919 colspec = nodes.colspec(colwidth=colwidth)
1920 if stub_columns:
1921 colspec.attributes['stub'] = True
1922 stub_columns -= 1
1923 tgroup += colspec
1924 if headrows:
1925 thead = nodes.thead()
1926 tgroup += thead
1927 for row in headrows:
1928 thead += self.build_table_row(row, tableline)
1929 tbody = nodes.tbody()
1930 tgroup += tbody
1931 for row in bodyrows:
1932 tbody += self.build_table_row(row, tableline)
1933 return table
1934
1935 def build_table_row(self, rowdata, tableline):
1936 row = nodes.row()
1937 for cell in rowdata:
1938 if cell is None:
1939 continue
1940 morerows, morecols, offset, cellblock = cell
1941 attributes = {}
1942 if morerows:
1943 attributes['morerows'] = morerows
1944 if morecols:
1945 attributes['morecols'] = morecols
1946 entry = nodes.entry(**attributes)
1947 row += entry
1948 if ''.join(cellblock):
1949 self.nested_parse(cellblock, input_offset=tableline+offset-1,
1950 node=entry)
1951 return row
1952
1953 explicit = Struct()
1954 """Patterns and constants used for explicit markup recognition."""
1955
1956 explicit.patterns = Struct(
1957 target=re.compile(r"""
1958 (
1959 _ # anonymous target
1960 | # *OR*
1961 (?!_) # no underscore at the beginning
1962 (?P<quote>`?) # optional open quote
1963 (?![ `]) # first char. not space or
1964 # backquote
1965 (?P<name> # reference name
1966 .+?
1967 )
1968 %(non_whitespace_escape_before)s
1969 (?P=quote) # close quote if open quote used
1970 )
1971 (?<!(?<!\x00):) # no unescaped colon at end
1972 %(non_whitespace_escape_before)s
1973 [ ]? # optional space
1974 : # end of reference name
1975 ([ ]+|$) # followed by whitespace
1976 """ % vars(Inliner), re.VERBOSE),
1977 reference=re.compile(r"""
1978 (
1979 (?P<simple>%(simplename)s)_
1980 | # *OR*
1981 ` # open backquote
1982 (?![ ]) # not space
1983 (?P<phrase>.+?) # hyperlink phrase
1984 %(non_whitespace_escape_before)s
1985 `_ # close backquote,
1986 # reference mark
1987 )
1988 $ # end of string
1989 """ % vars(Inliner), re.VERBOSE),
1990 substitution=re.compile(r"""
1991 (
1992 (?![ ]) # first char. not space
1993 (?P<name>.+?) # substitution text
1994 %(non_whitespace_escape_before)s
1995 \| # close delimiter
1996 )
1997 ([ ]+|$) # followed by whitespace
1998 """ % vars(Inliner),
1999 re.VERBOSE),)
2000
2001 def footnote(self, match):
2002 src, srcline = self.state_machine.get_source_and_line()
2003 (indented, indent, offset, blank_finish
2004 ) = self.state_machine.get_first_known_indented(match.end())
2005 label = match.group(1)
2006 name = normalize_name(label)
2007 footnote = nodes.footnote('\n'.join(indented))
2008 footnote.source = src
2009 footnote.line = srcline
2010 if name[0] == '#': # auto-numbered
2011 name = name[1:] # autonumber label
2012 footnote['auto'] = 1
2013 if name:
2014 footnote['names'].append(name)
2015 self.document.note_autofootnote(footnote)
2016 elif name == '*': # auto-symbol
2017 name = ''
2018 footnote['auto'] = '*'
2019 self.document.note_symbol_footnote(footnote)
2020 else: # manually numbered
2021 footnote += nodes.label('', label)
2022 footnote['names'].append(name)
2023 self.document.note_footnote(footnote)
2024 if name:
2025 self.document.note_explicit_target(footnote, footnote)
2026 else:
2027 self.document.set_id(footnote, footnote)
2028 if indented:
2029 self.nested_parse(indented, input_offset=offset, node=footnote)
2030 else:
2031 footnote += self.reporter.warning('Footnote content expected.')
2032 return [footnote], blank_finish
2033
2034 def citation(self, match):
2035 src, srcline = self.state_machine.get_source_and_line()
2036 (indented, indent, offset, blank_finish
2037 ) = self.state_machine.get_first_known_indented(match.end())
2038 label = match.group(1)
2039 name = normalize_name(label)
2040 citation = nodes.citation('\n'.join(indented))
2041 citation.source = src
2042 citation.line = srcline
2043 citation += nodes.label('', label)
2044 citation['names'].append(name)
2045 self.document.note_citation(citation)
2046 self.document.note_explicit_target(citation, citation)
2047 if indented:
2048 self.nested_parse(indented, input_offset=offset, node=citation)
2049 else:
2050 citation += self.reporter.warning('Citation content expected.')
2051 return [citation], blank_finish
2052
2053 def hyperlink_target(self, match):
2054 pattern = self.explicit.patterns.target
2055 lineno = self.state_machine.abs_line_number()
2056 (block, indent, offset, blank_finish
2057 ) = self.state_machine.get_first_known_indented(
2058 match.end(), until_blank=True, strip_indent=False)
2059 blocktext = match.string[:match.end()] + '\n'.join(block)
2060 block = [escape2null(line) for line in block]
2061 escaped = block[0]
2062 blockindex = 0
2063 while True:
2064 targetmatch = pattern.match(escaped)
2065 if targetmatch:
2066 break
2067 blockindex += 1
2068 try:
2069 escaped += block[blockindex]
2070 except IndexError:
2071 raise MarkupError('malformed hyperlink target.')
2072 del block[:blockindex]
2073 block[0] = (block[0] + ' ')[targetmatch.end()-len(escaped)-1:].strip()
2074 target = self.make_target(block, blocktext, lineno,
2075 targetmatch.group('name'))
2076 return [target], blank_finish
2077
2078 def make_target(self, block, block_text, lineno, target_name):
2079 target_type, data = self.parse_target(block, block_text, lineno)
2080 if target_type == 'refname':
2081 target = nodes.target(block_text, '', refname=normalize_name(data))
2082 self.add_target(target_name, '', target, lineno)
2083 self.document.note_indirect_target(target)
2084 return target
2085 elif target_type == 'refuri':
2086 target = nodes.target(block_text, '')
2087 self.add_target(target_name, data, target, lineno)
2088 return target
2089 else:
2090 return data
2091
2092 def parse_target(self, block, block_text, lineno):
2093 """
2094 Determine the type of reference of a target.
2095
2096 :Return: A 2-tuple, one of:
2097
2098 - 'refname' and the indirect reference name
2099 - 'refuri' and the URI
2100 - 'malformed' and a system_message node
2101 """
2102 if block and block[-1].strip()[-1:] == '_': # possible indirect target
2103 reference = ' '.join(line.strip() for line in block)
2104 refname = self.is_reference(reference)
2105 if refname:
2106 return 'refname', refname
2107 ref_parts = split_escaped_whitespace(' '.join(block))
2108 reference = ' '.join(''.join(unescape(part).split())
2109 for part in ref_parts)
2110 return 'refuri', reference
2111
2112 def is_reference(self, reference):
2113 match = self.explicit.patterns.reference.match(
2114 whitespace_normalize_name(reference))
2115 if not match:
2116 return None
2117 return unescape(match.group('simple') or match.group('phrase'))
2118
2119 def add_target(self, targetname, refuri, target, lineno):
2120 target.line = lineno
2121 if targetname:
2122 name = normalize_name(unescape(targetname))
2123 target['names'].append(name)
2124 if refuri:
2125 uri = self.inliner.adjust_uri(refuri)
2126 if uri:
2127 target['refuri'] = uri
2128 else:
2129 raise ApplicationError('problem with URI: %r' % refuri)
2130 self.document.note_explicit_target(target, self.parent)
2131 else: # anonymous target
2132 if refuri:
2133 target['refuri'] = refuri
2134 target['anonymous'] = True
2135 self.document.note_anonymous_target(target)
2136
2137 def substitution_def(self, match):
2138 pattern = self.explicit.patterns.substitution
2139 src, srcline = self.state_machine.get_source_and_line()
2140 (block, indent, offset, blank_finish
2141 ) = self.state_machine.get_first_known_indented(match.end(),
2142 strip_indent=False)
2143 blocktext = (match.string[:match.end()] + '\n'.join(block))
2144 block.disconnect()
2145 escaped = escape2null(block[0].rstrip())
2146 blockindex = 0
2147 while True:
2148 subdefmatch = pattern.match(escaped)
2149 if subdefmatch:
2150 break
2151 blockindex += 1
2152 try:
2153 escaped = escaped + ' ' + escape2null(
2154 block[blockindex].strip())
2155 except IndexError:
2156 raise MarkupError('malformed substitution definition.')
2157 del block[:blockindex] # strip out the substitution marker
2158 start = subdefmatch.end()-len(escaped)-1
2159 block[0] = (block[0].strip() + ' ')[start:-1]
2160 if not block[0]:
2161 del block[0]
2162 offset += 1
2163 while block and not block[-1].strip():
2164 block.pop()
2165 subname = subdefmatch.group('name')
2166 substitution_node = nodes.substitution_definition(blocktext)
2167 substitution_node.source = src
2168 substitution_node.line = srcline
2169 if not block:
2170 msg = self.reporter.warning(
2171 'Substitution definition "%s" missing contents.' % subname,
2172 nodes.literal_block(blocktext, blocktext),
2173 source=src, line=srcline)
2174 return [msg], blank_finish
2175 block[0] = block[0].strip()
2176 substitution_node['names'].append(
2177 nodes.whitespace_normalize_name(subname))
2178 new_abs_offset, blank_finish = self.nested_list_parse(
2179 block, input_offset=offset, node=substitution_node,
2180 initial_state='SubstitutionDef', blank_finish=blank_finish)
2181 i = 0
2182 for node in substitution_node[:]:
2183 if not (isinstance(node, nodes.Inline)
2184 or isinstance(node, nodes.Text)):
2185 self.parent += substitution_node[i]
2186 del substitution_node[i]
2187 else:
2188 i += 1
2189 for node in substitution_node.findall(nodes.Element,
2190 include_self=False):
2191 if isinstance(node, nodes.problematic):
2192 msg = self.reporter.error(
2193 'Problematic content in substitution definition',
2194 nodes.literal_block('', blocktext),
2195 source=src, line=srcline)
2196 msg.append(nodes.block_quote(
2197 '', nodes.paragraph('', '', *substitution_node.children)))
2198 return [msg], blank_finish
2199 illegal = self.disallowed_inside_substitution_definitions(node)
2200 if illegal:
2201 msg = self.reporter.error(f'{illegal} are not supported in '
2202 'a substitution definition.',
2203 nodes.literal_block('', blocktext),
2204 source=src, line=srcline)
2205 return [msg], blank_finish
2206 if len(substitution_node) == 0:
2207 msg = self.reporter.warning(
2208 'Substitution definition "%s" empty or invalid.' % subname,
2209 nodes.literal_block(blocktext, blocktext),
2210 source=src, line=srcline)
2211 return [msg], blank_finish
2212 self.document.note_substitution_def(
2213 substitution_node, subname, self.parent)
2214 return [substitution_node], blank_finish
2215
2216 def disallowed_inside_substitution_definitions(self, node) -> str:
2217 if isinstance(node, nodes.reference) and node.get('anonymous'):
2218 return 'Anonymous references'
2219 if isinstance(node, nodes.footnote_reference) and node.get('auto'):
2220 return 'References to auto-numbered and auto-symbol footnotes'
2221 if node['names'] or node['ids']:
2222 return 'Targets (names and identifiers)'
2223 else:
2224 return ''
2225
2226 def directive(self, match, **option_presets):
2227 """Returns a 2-tuple: list of nodes, and a "blank finish" boolean."""
2228 type_name = match.group(1)
2229 directive_class, messages = directives.directive(
2230 type_name, self.memo.language, self.document)
2231 self.parent += messages
2232 if directive_class:
2233 return self.run_directive(
2234 directive_class, match, type_name, option_presets)
2235 else:
2236 return self.unknown_directive(type_name)
2237
2238 def run_directive(self, directive, match, type_name, option_presets):
2239 """
2240 Parse a directive then run its directive function.
2241
2242 Parameters:
2243
2244 - `directive`: The class implementing the directive. Must be
2245 a subclass of `rst.Directive`.
2246
2247 - `match`: A regular expression match object which matched the first
2248 line of the directive.
2249
2250 - `type_name`: The directive name, as used in the source text.
2251
2252 - `option_presets`: A dictionary of preset options, defaults for the
2253 directive options. Currently, only an "alt" option is passed by
2254 substitution definitions (value: the substitution name), which may
2255 be used by an embedded image directive.
2256
2257 Returns a 2-tuple: list of nodes, and a "blank finish" boolean.
2258 """
2259 if isinstance(directive, (FunctionType, MethodType)):
2260 from docutils.parsers.rst import convert_directive_function
2261 directive = convert_directive_function(directive)
2262 lineno = self.state_machine.abs_line_number()
2263 initial_line_offset = self.state_machine.line_offset
2264 (indented, indent, line_offset, blank_finish
2265 ) = self.state_machine.get_first_known_indented(match.end(),
2266 strip_top=0)
2267 block_text = '\n'.join(self.state_machine.input_lines[
2268 initial_line_offset : self.state_machine.line_offset + 1]) # noqa: E203,E501
2269 try:
2270 arguments, options, content, content_offset = (
2271 self.parse_directive_block(indented, line_offset,
2272 directive, option_presets))
2273 except MarkupError as detail:
2274 error = self.reporter.error(
2275 'Error in "%s" directive:\n%s.' % (type_name,
2276 ' '.join(detail.args)),
2277 nodes.literal_block(block_text, block_text), line=lineno)
2278 return [error], blank_finish
2279 directive_instance = directive(
2280 type_name, arguments, options, content, lineno,
2281 content_offset, block_text, self, self.state_machine)
2282 try:
2283 result = directive_instance.run()
2284 except docutils.parsers.rst.DirectiveError as error:
2285 msg_node = self.reporter.system_message(error.level, error.msg,
2286 line=lineno)
2287 msg_node += nodes.literal_block(block_text, block_text)
2288 result = [msg_node]
2289 assert isinstance(result, list), \
2290 'Directive "%s" must return a list of nodes.' % type_name
2291 for i in range(len(result)):
2292 assert isinstance(result[i], nodes.Node), \
2293 ('Directive "%s" returned non-Node object (index %s): %r'
2294 % (type_name, i, result[i]))
2295 return (result,
2296 blank_finish or self.state_machine.is_next_line_blank())
2297
2298 def parse_directive_block(self, indented, line_offset, directive,
2299 option_presets):
2300 option_spec = directive.option_spec
2301 has_content = directive.has_content
2302 if indented and not indented[0].strip():
2303 indented.trim_start()
2304 line_offset += 1
2305 while indented and not indented[-1].strip():
2306 indented.trim_end()
2307 if indented and (directive.required_arguments
2308 or directive.optional_arguments
2309 or option_spec):
2310 for i, line in enumerate(indented):
2311 if not line.strip():
2312 break
2313 else:
2314 i += 1
2315 arg_block = indented[:i]
2316 content = indented[i+1:]
2317 content_offset = line_offset + i + 1
2318 else:
2319 content = indented
2320 content_offset = line_offset
2321 arg_block = []
2322 if option_spec:
2323 options, arg_block = self.parse_directive_options(
2324 option_presets, option_spec, arg_block)
2325 else:
2326 options = {}
2327 if arg_block and not (directive.required_arguments
2328 or directive.optional_arguments):
2329 content = arg_block + indented[i:]
2330 content_offset = line_offset
2331 arg_block = []
2332 while content and not content[0].strip():
2333 content.trim_start()
2334 content_offset += 1
2335 if directive.required_arguments or directive.optional_arguments:
2336 arguments = self.parse_directive_arguments(
2337 directive, arg_block)
2338 else:
2339 arguments = []
2340 if content and not has_content:
2341 raise MarkupError('no content permitted')
2342 return arguments, options, content, content_offset
2343
2344 def parse_directive_options(self, option_presets, option_spec, arg_block):
2345 options = option_presets.copy()
2346 for i, line in enumerate(arg_block):
2347 if re.match(Body.patterns['field_marker'], line):
2348 opt_block = arg_block[i:]
2349 arg_block = arg_block[:i]
2350 break
2351 else:
2352 opt_block = []
2353 if opt_block:
2354 success, data = self.parse_extension_options(option_spec,
2355 opt_block)
2356 if success: # data is a dict of options
2357 options.update(data)
2358 else: # data is an error string
2359 raise MarkupError(data)
2360 return options, arg_block
2361
2362 def parse_directive_arguments(self, directive, arg_block):
2363 required = directive.required_arguments
2364 optional = directive.optional_arguments
2365 arg_text = '\n'.join(arg_block)
2366 arguments = arg_text.split()
2367 if len(arguments) < required:
2368 raise MarkupError('%s argument(s) required, %s supplied'
2369 % (required, len(arguments)))
2370 elif len(arguments) > required + optional:
2371 if directive.final_argument_whitespace:
2372 arguments = arg_text.split(None, required + optional - 1)
2373 else:
2374 raise MarkupError(
2375 'maximum %s argument(s) allowed, %s supplied'
2376 % (required + optional, len(arguments)))
2377 return arguments
2378
2379 def parse_extension_options(self, option_spec, datalines):
2380 """
2381 Parse `datalines` for a field list containing extension options
2382 matching `option_spec`.
2383
2384 :Parameters:
2385 - `option_spec`: a mapping of option name to conversion
2386 function, which should raise an exception on bad input.
2387 - `datalines`: a list of input strings.
2388
2389 :Return:
2390 - Success value, 1 or 0.
2391 - An option dictionary on success, an error string on failure.
2392 """
2393 node = nodes.field_list()
2394 newline_offset, blank_finish = self.nested_list_parse(
2395 datalines, 0, node, initial_state='ExtensionOptions',
2396 blank_finish=True)
2397 if newline_offset != len(datalines): # incomplete parse of block
2398 return 0, 'invalid option block'
2399 try:
2400 options = utils.extract_extension_options(node, option_spec)
2401 except KeyError as detail:
2402 return 0, 'unknown option: "%s"' % detail.args[0]
2403 except (ValueError, TypeError) as detail:
2404 return 0, 'invalid option value: %s' % ' '.join(detail.args)
2405 except utils.ExtensionOptionError as detail:
2406 return 0, 'invalid option data: %s' % ' '.join(detail.args)
2407 if blank_finish:
2408 return 1, options
2409 else:
2410 return 0, 'option data incompletely parsed'
2411
2412 def unknown_directive(self, type_name):
2413 lineno = self.state_machine.abs_line_number()
2414 (indented, indent, offset, blank_finish
2415 ) = self.state_machine.get_first_known_indented(0, strip_indent=False)
2416 text = '\n'.join(indented)
2417 error = self.reporter.error('Unknown directive type "%s".' % type_name,
2418 nodes.literal_block(text, text),
2419 line=lineno)
2420 return [error], blank_finish
2421
2422 def comment(self, match):
2423 if self.state_machine.is_next_line_blank():
2424 first_comment_line = match.string[match.end():]
2425 if not first_comment_line.strip(): # empty comment
2426 return [nodes.comment()], True # "A tiny but practical wart."
2427 if first_comment_line.startswith('end of inclusion from "'):
2428 # cf. parsers.rst.directives.misc.Include
2429 self.document.include_log.pop()
2430 return [], True
2431 (indented, indent, offset, blank_finish
2432 ) = self.state_machine.get_first_known_indented(match.end())
2433 while indented and not indented[-1].strip():
2434 indented.trim_end()
2435 text = '\n'.join(indented)
2436 return [nodes.comment(text, text)], blank_finish
2437
2438 explicit.constructs = [
2439 (footnote,
2440 re.compile(r"""
2441 \.\.[ ]+ # explicit markup start
2442 \[
2443 ( # footnote label:
2444 [0-9]+ # manually numbered footnote
2445 | # *OR*
2446 \# # anonymous auto-numbered footnote
2447 | # *OR*
2448 \#%s # auto-number ed?) footnote label
2449 | # *OR*
2450 \* # auto-symbol footnote
2451 )
2452 \]
2453 ([ ]+|$) # whitespace or end of line
2454 """ % Inliner.simplename, re.VERBOSE)),
2455 (citation,
2456 re.compile(r"""
2457 \.\.[ ]+ # explicit markup start
2458 \[(%s)\] # citation label
2459 ([ ]+|$) # whitespace or end of line
2460 """ % Inliner.simplename, re.VERBOSE)),
2461 (hyperlink_target,
2462 re.compile(r"""
2463 \.\.[ ]+ # explicit markup start
2464 _ # target indicator
2465 (?![ ]|$) # first char. not space or EOL
2466 """, re.VERBOSE)),
2467 (substitution_def,
2468 re.compile(r"""
2469 \.\.[ ]+ # explicit markup start
2470 \| # substitution indicator
2471 (?![ ]|$) # first char. not space or EOL
2472 """, re.VERBOSE)),
2473 (directive,
2474 re.compile(r"""
2475 \.\.[ ]+ # explicit markup start
2476 (%s) # directive name
2477 [ ]? # optional space
2478 :: # directive delimiter
2479 ([ ]+|$) # whitespace or end of line
2480 """ % Inliner.simplename, re.VERBOSE))]
2481
2482 def explicit_markup(self, match, context, next_state):
2483 """Footnotes, hyperlink targets, directives, comments."""
2484 nodelist, blank_finish = self.explicit_construct(match)
2485 self.parent += nodelist
2486 self.explicit_list(blank_finish)
2487 return [], next_state, []
2488
2489 def explicit_construct(self, match):
2490 """Determine which explicit construct this is, parse & return it."""
2491 errors = []
2492 for method, pattern in self.explicit.constructs:
2493 expmatch = pattern.match(match.string)
2494 if expmatch:
2495 try:
2496 return method(self, expmatch)
2497 except MarkupError as error:
2498 lineno = self.state_machine.abs_line_number()
2499 message = ' '.join(error.args)
2500 errors.append(self.reporter.warning(message, line=lineno))
2501 break
2502 nodelist, blank_finish = self.comment(match)
2503 return nodelist + errors, blank_finish
2504
2505 def explicit_list(self, blank_finish) -> None:
2506 """
2507 Create a nested state machine for a series of explicit markup
2508 constructs (including anonymous hyperlink targets).
2509 """
2510 offset = self.state_machine.line_offset + 1 # next line
2511 newline_offset, blank_finish = self.nested_list_parse(
2512 self.state_machine.input_lines[offset:],
2513 input_offset=self.state_machine.abs_line_offset() + 1,
2514 node=self.parent, initial_state='Explicit',
2515 blank_finish=blank_finish)
2516 self.goto_line(newline_offset)
2517 if not blank_finish:
2518 self.parent += self.unindent_warning('Explicit markup')
2519
2520 def anonymous(self, match, context, next_state):
2521 """Anonymous hyperlink targets."""
2522 nodelist, blank_finish = self.anonymous_target(match)
2523 self.parent += nodelist
2524 self.explicit_list(blank_finish)
2525 return [], next_state, []
2526
2527 def anonymous_target(self, match):
2528 lineno = self.state_machine.abs_line_number()
2529 (block, indent, offset, blank_finish
2530 ) = self.state_machine.get_first_known_indented(match.end(),
2531 until_blank=True)
2532 blocktext = match.string[:match.end()] + '\n'.join(block)
2533 block = [escape2null(line) for line in block]
2534 target = self.make_target(block, blocktext, lineno, '')
2535 return [target], blank_finish
2536
2537 def line(self, match, context, next_state):
2538 """Section title overline or transition marker."""
2539 if self.state_machine.match_titles:
2540 return [match.string], 'Line', []
2541 elif match.string.strip() == '::':
2542 raise statemachine.TransitionCorrection('text')
2543 elif len(match.string.strip()) < 4:
2544 msg = self.reporter.info(
2545 'Unexpected possible title overline or transition.\n'
2546 "Treating it as ordinary text because it's so short.",
2547 line=self.state_machine.abs_line_number())
2548 self.parent += msg
2549 raise statemachine.TransitionCorrection('text')
2550 else:
2551 blocktext = self.state_machine.line
2552 msg = self.reporter.error(
2553 'Unexpected section title or transition.',
2554 nodes.literal_block(blocktext, blocktext),
2555 line=self.state_machine.abs_line_number())
2556 self.parent += msg
2557 return [], next_state, []
2558
2559 def text(self, match, context, next_state):
2560 """Titles, definition lists, paragraphs."""
2561 return [match.string], 'Text', []
2562
2563
2564class RFC2822Body(Body):
2565
2566 """
2567 RFC2822 headers are only valid as the first constructs in documents. As
2568 soon as anything else appears, the `Body` state should take over.
2569 """
2570
2571 patterns = Body.patterns.copy() # can't modify the original
2572 patterns['rfc2822'] = r'[!-9;-~]+:( +|$)'
2573 initial_transitions = [(name, 'Body')
2574 for name in Body.initial_transitions]
2575 initial_transitions.insert(-1, ('rfc2822', 'Body')) # just before 'text'
2576
2577 def rfc2822(self, match, context, next_state):
2578 """RFC2822-style field list item."""
2579 fieldlist = nodes.field_list(classes=['rfc2822'])
2580 self.parent += fieldlist
2581 field, blank_finish = self.rfc2822_field(match)
2582 fieldlist += field
2583 offset = self.state_machine.line_offset + 1 # next line
2584 newline_offset, blank_finish = self.nested_list_parse(
2585 self.state_machine.input_lines[offset:],
2586 input_offset=self.state_machine.abs_line_offset() + 1,
2587 node=fieldlist, initial_state='RFC2822List',
2588 blank_finish=blank_finish)
2589 self.goto_line(newline_offset)
2590 if not blank_finish:
2591 self.parent += self.unindent_warning(
2592 'RFC2822-style field list')
2593 return [], next_state, []
2594
2595 def rfc2822_field(self, match):
2596 name = match.string[:match.string.find(':')]
2597 (indented, indent, line_offset, blank_finish
2598 ) = self.state_machine.get_first_known_indented(match.end(),
2599 until_blank=True)
2600 fieldnode = nodes.field()
2601 fieldnode += nodes.field_name(name, name)
2602 fieldbody = nodes.field_body('\n'.join(indented))
2603 fieldnode += fieldbody
2604 if indented:
2605 self.nested_parse(indented, input_offset=line_offset,
2606 node=fieldbody)
2607 return fieldnode, blank_finish
2608
2609
2610class SpecializedBody(Body):
2611
2612 """
2613 Superclass for second and subsequent compound element members. Compound
2614 elements are lists and list-like constructs.
2615
2616 All transition methods are disabled (redefined as `invalid_input`).
2617 Override individual methods in subclasses to re-enable.
2618
2619 For example, once an initial bullet list item, say, is recognized, the
2620 `BulletList` subclass takes over, with a "bullet_list" node as its
2621 container. Upon encountering the initial bullet list item, `Body.bullet`
2622 calls its ``self.nested_list_parse`` (`RSTState.nested_list_parse`), which
2623 starts up a nested parsing session with `BulletList` as the initial state.
2624 Only the ``bullet`` transition method is enabled in `BulletList`; as long
2625 as only bullet list items are encountered, they are parsed and inserted
2626 into the container. The first construct which is *not* a bullet list item
2627 triggers the `invalid_input` method, which ends the nested parse and
2628 closes the container. `BulletList` needs to recognize input that is
2629 invalid in the context of a bullet list, which means everything *other
2630 than* bullet list items, so it inherits the transition list created in
2631 `Body`.
2632 """
2633
2634 def invalid_input(self, match=None, context=None, next_state=None):
2635 """Not a compound element member. Abort this state machine."""
2636 self.state_machine.previous_line() # back up so parent SM can reassess
2637 raise EOFError
2638
2639 indent = invalid_input
2640 bullet = invalid_input
2641 enumerator = invalid_input
2642 field_marker = invalid_input
2643 option_marker = invalid_input
2644 doctest = invalid_input
2645 line_block = invalid_input
2646 grid_table_top = invalid_input
2647 simple_table_top = invalid_input
2648 explicit_markup = invalid_input
2649 anonymous = invalid_input
2650 line = invalid_input
2651 text = invalid_input
2652
2653
2654class BulletList(SpecializedBody):
2655
2656 """Second and subsequent bullet_list list_items."""
2657
2658 def bullet(self, match, context, next_state):
2659 """Bullet list item."""
2660 if match.string[0] != self.parent['bullet']:
2661 # different bullet: new list
2662 self.invalid_input()
2663 listitem, blank_finish = self.list_item(match.end())
2664 self.parent += listitem
2665 self.blank_finish = blank_finish
2666 return [], next_state, []
2667
2668
2669class DefinitionList(SpecializedBody):
2670
2671 """Second and subsequent definition_list_items."""
2672
2673 def text(self, match, context, next_state):
2674 """Definition lists."""
2675 return [match.string], 'Definition', []
2676
2677
2678class EnumeratedList(SpecializedBody):
2679
2680 """Second and subsequent enumerated_list list_items."""
2681
2682 def enumerator(self, match, context, next_state):
2683 """Enumerated list item."""
2684 format, sequence, text, ordinal = self.parse_enumerator(
2685 match, self.parent['enumtype'])
2686 if (format != self.format
2687 or (sequence != '#' and (sequence != self.parent['enumtype']
2688 or self.auto
2689 or ordinal != (self.lastordinal + 1)))
2690 or not self.is_enumerated_list_item(ordinal, sequence, format)):
2691 # different enumeration: new list
2692 self.invalid_input()
2693 if sequence == '#':
2694 self.auto = 1
2695 listitem, blank_finish = self.list_item(match.end())
2696 self.parent += listitem
2697 self.blank_finish = blank_finish
2698 self.lastordinal = ordinal
2699 return [], next_state, []
2700
2701
2702class FieldList(SpecializedBody):
2703
2704 """Second and subsequent field_list fields."""
2705
2706 def field_marker(self, match, context, next_state):
2707 """Field list field."""
2708 field, blank_finish = self.field(match)
2709 self.parent += field
2710 self.blank_finish = blank_finish
2711 return [], next_state, []
2712
2713
2714class OptionList(SpecializedBody):
2715
2716 """Second and subsequent option_list option_list_items."""
2717
2718 def option_marker(self, match, context, next_state):
2719 """Option list item."""
2720 try:
2721 option_list_item, blank_finish = self.option_list_item(match)
2722 except MarkupError:
2723 self.invalid_input()
2724 self.parent += option_list_item
2725 self.blank_finish = blank_finish
2726 return [], next_state, []
2727
2728
2729class RFC2822List(SpecializedBody, RFC2822Body):
2730
2731 """Second and subsequent RFC2822-style field_list fields."""
2732
2733 patterns = RFC2822Body.patterns
2734 initial_transitions = RFC2822Body.initial_transitions
2735
2736 def rfc2822(self, match, context, next_state):
2737 """RFC2822-style field list item."""
2738 field, blank_finish = self.rfc2822_field(match)
2739 self.parent += field
2740 self.blank_finish = blank_finish
2741 return [], 'RFC2822List', []
2742
2743 blank = SpecializedBody.invalid_input
2744
2745
2746class ExtensionOptions(FieldList):
2747
2748 """
2749 Parse field_list fields for extension options.
2750
2751 No nested parsing is done (including inline markup parsing).
2752 """
2753
2754 def parse_field_body(self, indented, offset, node) -> None:
2755 """Override `Body.parse_field_body` for simpler parsing."""
2756 lines = []
2757 for line in list(indented) + ['']:
2758 if line.strip():
2759 lines.append(line)
2760 elif lines:
2761 text = '\n'.join(lines)
2762 node += nodes.paragraph(text, text)
2763 lines = []
2764
2765
2766class LineBlock(SpecializedBody):
2767
2768 """Second and subsequent lines of a line_block."""
2769
2770 blank = SpecializedBody.invalid_input
2771
2772 def line_block(self, match, context, next_state):
2773 """New line of line block."""
2774 lineno = self.state_machine.abs_line_number()
2775 line, messages, blank_finish = self.line_block_line(match, lineno)
2776 self.parent += line
2777 self.parent.parent += messages
2778 self.blank_finish = blank_finish
2779 return [], next_state, []
2780
2781
2782class Explicit(SpecializedBody):
2783
2784 """Second and subsequent explicit markup construct."""
2785
2786 def explicit_markup(self, match, context, next_state):
2787 """Footnotes, hyperlink targets, directives, comments."""
2788 nodelist, blank_finish = self.explicit_construct(match)
2789 self.parent += nodelist
2790 self.blank_finish = blank_finish
2791 return [], next_state, []
2792
2793 def anonymous(self, match, context, next_state):
2794 """Anonymous hyperlink targets."""
2795 nodelist, blank_finish = self.anonymous_target(match)
2796 self.parent += nodelist
2797 self.blank_finish = blank_finish
2798 return [], next_state, []
2799
2800 blank = SpecializedBody.invalid_input
2801
2802
2803class SubstitutionDef(Body):
2804
2805 """
2806 Parser for the contents of a substitution_definition element.
2807 """
2808
2809 patterns = {
2810 'embedded_directive': re.compile(r'(%s)::( +|$)'
2811 % Inliner.simplename),
2812 'text': r''}
2813 initial_transitions = ['embedded_directive', 'text']
2814
2815 def embedded_directive(self, match, context, next_state):
2816 nodelist, blank_finish = self.directive(match,
2817 alt=self.parent['names'][0])
2818 self.parent += nodelist
2819 if not self.state_machine.at_eof():
2820 self.blank_finish = blank_finish
2821 raise EOFError
2822
2823 def text(self, match, context, next_state):
2824 if not self.state_machine.at_eof():
2825 self.blank_finish = self.state_machine.is_next_line_blank()
2826 raise EOFError
2827
2828
2829class Text(RSTState):
2830
2831 """
2832 Classifier of second line of a text block.
2833
2834 Could be a paragraph, a definition list item, or a title.
2835 """
2836
2837 patterns = {'underline': Body.patterns['line'],
2838 'text': r''}
2839 initial_transitions = [('underline', 'Body'), ('text', 'Body')]
2840
2841 def blank(self, match, context, next_state):
2842 """End of paragraph."""
2843 # NOTE: self.paragraph returns [node, system_message(s)], literalnext
2844 paragraph, literalnext = self.paragraph(
2845 context, self.state_machine.abs_line_number() - 1)
2846 self.parent += paragraph
2847 if literalnext:
2848 self.parent += self.literal_block()
2849 return [], 'Body', []
2850
2851 def eof(self, context):
2852 if context:
2853 self.blank(None, context, None)
2854 return []
2855
2856 def indent(self, match, context, next_state):
2857 """Definition list item."""
2858 dl = nodes.definition_list()
2859 # the definition list starts on the line before the indent:
2860 lineno = self.state_machine.abs_line_number() - 1
2861 dl.source, dl.line = self.state_machine.get_source_and_line(lineno)
2862 dl_item, blank_finish = self.definition_list_item(context)
2863 dl += dl_item
2864 self.parent += dl
2865 offset = self.state_machine.line_offset + 1 # next line
2866 newline_offset, blank_finish = self.nested_list_parse(
2867 self.state_machine.input_lines[offset:],
2868 input_offset=self.state_machine.abs_line_offset() + 1,
2869 node=dl, initial_state='DefinitionList',
2870 blank_finish=blank_finish, blank_finish_state='Definition')
2871 self.goto_line(newline_offset)
2872 if not blank_finish:
2873 self.parent += self.unindent_warning('Definition list')
2874 return [], 'Body', []
2875
2876 def underline(self, match, context, next_state):
2877 """Section title."""
2878 lineno = self.state_machine.abs_line_number()
2879 title = context[0].rstrip()
2880 underline = match.string.rstrip()
2881 source = title + '\n' + underline
2882 messages = []
2883 if column_width(title) > len(underline):
2884 if len(underline) < 4:
2885 if self.state_machine.match_titles:
2886 msg = self.reporter.info(
2887 'Possible title underline, too short for the title.\n'
2888 "Treating it as ordinary text because it's so short.",
2889 line=lineno)
2890 self.parent += msg
2891 raise statemachine.TransitionCorrection('text')
2892 else:
2893 blocktext = context[0] + '\n' + self.state_machine.line
2894 msg = self.reporter.warning(
2895 'Title underline too short.',
2896 nodes.literal_block(blocktext, blocktext),
2897 line=lineno)
2898 messages.append(msg)
2899 if not self.state_machine.match_titles:
2900 blocktext = context[0] + '\n' + self.state_machine.line
2901 # We need get_source_and_line() here to report correctly
2902 src, srcline = self.state_machine.get_source_and_line()
2903 # TODO: why is abs_line_number() == srcline+1
2904 # if the error is in a table (try with test_tables.py)?
2905 # print("get_source_and_line", srcline)
2906 # print("abs_line_number", self.state_machine.abs_line_number())
2907 msg = self.reporter.error(
2908 'Unexpected section title.',
2909 nodes.literal_block(blocktext, blocktext),
2910 source=src, line=srcline)
2911 self.parent += messages
2912 self.parent += msg
2913 return [], next_state, []
2914 style = underline[0]
2915 context[:] = []
2916 self.section(title, source, style, lineno - 1, messages)
2917 return [], next_state, []
2918
2919 def text(self, match, context, next_state):
2920 """Paragraph."""
2921 startline = self.state_machine.abs_line_number() - 1
2922 msg = None
2923 try:
2924 block = self.state_machine.get_text_block(flush_left=True)
2925 except statemachine.UnexpectedIndentationError as err:
2926 block, src, srcline = err.args
2927 msg = self.reporter.error('Unexpected indentation.',
2928 source=src, line=srcline)
2929 lines = context + list(block)
2930 paragraph, literalnext = self.paragraph(lines, startline)
2931 self.parent += paragraph
2932 self.parent += msg
2933 if literalnext:
2934 try:
2935 self.state_machine.next_line()
2936 except EOFError:
2937 pass
2938 self.parent += self.literal_block()
2939 return [], next_state, []
2940
2941 def literal_block(self):
2942 """Return a list of nodes."""
2943 (indented, indent, offset, blank_finish
2944 ) = self.state_machine.get_indented()
2945 while indented and not indented[-1].strip():
2946 indented.trim_end()
2947 if not indented:
2948 return self.quoted_literal_block()
2949 data = '\n'.join(indented)
2950 literal_block = nodes.literal_block(data, data)
2951 (literal_block.source,
2952 literal_block.line) = self.state_machine.get_source_and_line(offset+1)
2953 nodelist = [literal_block]
2954 if not blank_finish:
2955 nodelist.append(self.unindent_warning('Literal block'))
2956 return nodelist
2957
2958 def quoted_literal_block(self):
2959 abs_line_offset = self.state_machine.abs_line_offset()
2960 offset = self.state_machine.line_offset
2961 parent_node = nodes.Element()
2962 new_abs_offset = self.nested_parse(
2963 self.state_machine.input_lines[offset:],
2964 input_offset=abs_line_offset, node=parent_node, match_titles=False,
2965 state_machine_kwargs={'state_classes': (QuotedLiteralBlock,),
2966 'initial_state': 'QuotedLiteralBlock'})
2967 self.goto_line(new_abs_offset)
2968 return parent_node.children
2969
2970 def definition_list_item(self, termline):
2971 # the parser is already on the second (indented) line:
2972 dd_lineno = self.state_machine.abs_line_number()
2973 dt_lineno = dd_lineno - 1
2974 (indented, indent, line_offset, blank_finish
2975 ) = self.state_machine.get_indented()
2976 dl_item = nodes.definition_list_item(
2977 '\n'.join(termline + list(indented)))
2978 (dl_item.source,
2979 dl_item.line) = self.state_machine.get_source_and_line(dt_lineno)
2980 dt_nodes, messages = self.term(termline, dt_lineno)
2981 dl_item += dt_nodes
2982 dd = nodes.definition('', *messages)
2983 dd.source, dd.line = self.state_machine.get_source_and_line(dd_lineno)
2984 dl_item += dd
2985 if termline[0][-2:] == '::':
2986 dd += self.reporter.info(
2987 'Blank line missing before literal block (after the "::")? '
2988 'Interpreted as a definition list item.',
2989 line=dd_lineno)
2990 # TODO: drop a definition if it is an empty comment to allow
2991 # definition list items with several terms?
2992 # https://sourceforge.net/p/docutils/feature-requests/60/
2993 self.nested_parse(indented, input_offset=line_offset, node=dd)
2994 return dl_item, blank_finish
2995
2996 classifier_delimiter = re.compile(' +: +')
2997
2998 def term(self, lines, lineno):
2999 """Return a definition_list's term and optional classifiers."""
3000 assert len(lines) == 1
3001 text_nodes, messages = self.inline_text(lines[0], lineno)
3002 dt = nodes.term(lines[0])
3003 dt.source, dt.line = self.state_machine.get_source_and_line(lineno)
3004 node_list = [dt]
3005 for i in range(len(text_nodes)):
3006 node = text_nodes[i]
3007 if isinstance(node, nodes.Text):
3008 parts = self.classifier_delimiter.split(node)
3009 if len(parts) == 1:
3010 node_list[-1] += node
3011 else:
3012 text = parts[0].rstrip()
3013 textnode = nodes.Text(text)
3014 node_list[-1] += textnode
3015 node_list += [nodes.classifier(unescape(part, True), part)
3016 for part in parts[1:]]
3017 else:
3018 node_list[-1] += node
3019 return node_list, messages
3020
3021
3022class SpecializedText(Text):
3023
3024 """
3025 Superclass for second and subsequent lines of Text-variants.
3026
3027 All transition methods are disabled. Override individual methods in
3028 subclasses to re-enable.
3029 """
3030
3031 def eof(self, context):
3032 """Incomplete construct."""
3033 return []
3034
3035 def invalid_input(self, match=None, context=None, next_state=None):
3036 """Not a compound element member. Abort this state machine."""
3037 raise EOFError
3038
3039 blank = invalid_input
3040 indent = invalid_input
3041 underline = invalid_input
3042 text = invalid_input
3043
3044
3045class Definition(SpecializedText):
3046
3047 """Second line of potential definition_list_item."""
3048
3049 def eof(self, context):
3050 """Not a definition."""
3051 self.state_machine.previous_line(2) # so parent SM can reassess
3052 return []
3053
3054 def indent(self, match, context, next_state):
3055 """Definition list item."""
3056 dl_item, blank_finish = self.definition_list_item(context)
3057 self.parent += dl_item
3058 self.blank_finish = blank_finish
3059 return [], 'DefinitionList', []
3060
3061
3062class Line(SpecializedText):
3063
3064 """
3065 Second line of over- & underlined section title or transition marker.
3066 """
3067
3068 eofcheck = 1 # ignored, will be removed in Docutils 2.0.
3069
3070 def eof(self, context):
3071 """Transition marker at end of section or document."""
3072 marker = context[0].strip()
3073 if len(marker) < 4:
3074 self.state_correction(context)
3075 src, srcline = self.state_machine.get_source_and_line()
3076 # lineno = self.state_machine.abs_line_number() - 1
3077 transition = nodes.transition(rawsource=context[0])
3078 transition.source = src
3079 transition.line = srcline - 1
3080 # transition.line = lineno
3081 self.parent += transition
3082 return []
3083
3084 def blank(self, match, context, next_state):
3085 """Transition marker."""
3086 src, srcline = self.state_machine.get_source_and_line()
3087 marker = context[0].strip()
3088 if len(marker) < 4:
3089 self.state_correction(context)
3090 transition = nodes.transition(rawsource=marker)
3091 transition.source = src
3092 transition.line = srcline - 1
3093 self.parent += transition
3094 return [], 'Body', []
3095
3096 def text(self, match, context, next_state):
3097 """Potential over- & underlined title."""
3098 lineno = self.state_machine.abs_line_number() - 1
3099 overline = context[0]
3100 title = match.string
3101 underline = ''
3102 try:
3103 underline = self.state_machine.next_line()
3104 except EOFError:
3105 blocktext = overline + '\n' + title
3106 if len(overline.rstrip()) < 4:
3107 self.short_overline(context, blocktext, lineno, 2)
3108 else:
3109 msg = self.reporter.error(
3110 'Incomplete section title.',
3111 nodes.literal_block(blocktext, blocktext),
3112 line=lineno)
3113 self.parent += msg
3114 return [], 'Body', []
3115 source = '%s\n%s\n%s' % (overline, title, underline)
3116 overline = overline.rstrip()
3117 underline = underline.rstrip()
3118 if not self.transitions['underline'][0].match(underline):
3119 blocktext = overline + '\n' + title + '\n' + underline
3120 if len(overline.rstrip()) < 4:
3121 self.short_overline(context, blocktext, lineno, 2)
3122 else:
3123 msg = self.reporter.error(
3124 'Missing matching underline for section title overline.',
3125 nodes.literal_block(source, source),
3126 line=lineno)
3127 self.parent += msg
3128 return [], 'Body', []
3129 elif overline != underline:
3130 blocktext = overline + '\n' + title + '\n' + underline
3131 if len(overline.rstrip()) < 4:
3132 self.short_overline(context, blocktext, lineno, 2)
3133 else:
3134 msg = self.reporter.error(
3135 'Title overline & underline mismatch.',
3136 nodes.literal_block(source, source),
3137 line=lineno)
3138 self.parent += msg
3139 return [], 'Body', []
3140 title = title.rstrip()
3141 messages = []
3142 if column_width(title) > len(overline):
3143 blocktext = overline + '\n' + title + '\n' + underline
3144 if len(overline.rstrip()) < 4:
3145 self.short_overline(context, blocktext, lineno, 2)
3146 else:
3147 msg = self.reporter.warning(
3148 'Title overline too short.',
3149 nodes.literal_block(source, source),
3150 line=lineno)
3151 messages.append(msg)
3152 style = (overline[0], underline[0])
3153 self.section(title.lstrip(), source, style, lineno + 1, messages)
3154 return [], 'Body', []
3155
3156 indent = text # indented title
3157
3158 def underline(self, match, context, next_state):
3159 overline = context[0]
3160 blocktext = overline + '\n' + self.state_machine.line
3161 lineno = self.state_machine.abs_line_number() - 1
3162 if len(overline.rstrip()) < 4:
3163 self.short_overline(context, blocktext, lineno, 1)
3164 msg = self.reporter.error(
3165 'Invalid section title or transition marker.',
3166 nodes.literal_block(blocktext, blocktext),
3167 line=lineno)
3168 self.parent += msg
3169 return [], 'Body', []
3170
3171 def short_overline(self, context, blocktext, lineno, lines=1) -> None:
3172 msg = self.reporter.info(
3173 'Possible incomplete section title.\nTreating the overline as '
3174 "ordinary text because it's so short.",
3175 line=lineno)
3176 self.parent += msg
3177 self.state_correction(context, lines)
3178
3179 def state_correction(self, context, lines=1):
3180 self.state_machine.previous_line(lines)
3181 context[:] = []
3182 raise statemachine.StateCorrection('Body', 'text')
3183
3184
3185class QuotedLiteralBlock(RSTState):
3186
3187 """
3188 Nested parse handler for quoted (unindented) literal blocks.
3189
3190 Special-purpose. Not for inclusion in `state_classes`.
3191 """
3192
3193 patterns = {'initial_quoted': r'(%(nonalphanum7bit)s)' % Body.pats,
3194 'text': r''}
3195 initial_transitions = ('initial_quoted', 'text')
3196
3197 def __init__(self, state_machine, debug=False) -> None:
3198 RSTState.__init__(self, state_machine, debug)
3199 self.messages = []
3200 self.initial_lineno = None
3201
3202 def blank(self, match, context, next_state):
3203 if context:
3204 raise EOFError
3205 else:
3206 return context, next_state, []
3207
3208 def eof(self, context):
3209 if context:
3210 src, srcline = self.state_machine.get_source_and_line(
3211 self.initial_lineno)
3212 text = '\n'.join(context)
3213 literal_block = nodes.literal_block(text, text)
3214 literal_block.source = src
3215 literal_block.line = srcline
3216 self.parent += literal_block
3217 else:
3218 self.parent += self.reporter.warning(
3219 'Literal block expected; none found.',
3220 line=self.state_machine.abs_line_number()
3221 ) # src not available, statemachine.input_lines is empty
3222 self.state_machine.previous_line()
3223 self.parent += self.messages
3224 return []
3225
3226 def indent(self, match, context, next_state):
3227 assert context, ('QuotedLiteralBlock.indent: context should not '
3228 'be empty!')
3229 self.messages.append(
3230 self.reporter.error('Unexpected indentation.',
3231 line=self.state_machine.abs_line_number()))
3232 self.state_machine.previous_line()
3233 raise EOFError
3234
3235 def initial_quoted(self, match, context, next_state):
3236 """Match arbitrary quote character on the first line only."""
3237 self.remove_transition('initial_quoted')
3238 quote = match.string[0]
3239 pattern = re.compile(re.escape(quote))
3240 # New transition matches consistent quotes only:
3241 self.add_transition('quoted',
3242 (pattern, self.quoted, self.__class__.__name__))
3243 self.initial_lineno = self.state_machine.abs_line_number()
3244 return [match.string], next_state, []
3245
3246 def quoted(self, match, context, next_state):
3247 """Match consistent quotes on subsequent lines."""
3248 context.append(match.string)
3249 return context, next_state, []
3250
3251 def text(self, match, context, next_state):
3252 if context:
3253 self.messages.append(
3254 self.reporter.error('Inconsistent literal block quoting.',
3255 line=self.state_machine.abs_line_number()))
3256 self.state_machine.previous_line()
3257 raise EOFError
3258
3259
3260state_classes = (Body, BulletList, DefinitionList, EnumeratedList, FieldList,
3261 OptionList, LineBlock, ExtensionOptions, Explicit, Text,
3262 Definition, Line, SubstitutionDef, RFC2822Body, RFC2822List)
3263"""Standard set of State classes used to start `RSTStateMachine`."""