1# $Id$
2# Author: David Goodger <goodger@python.org>
3# Copyright: This module has been placed in the public domain.
4
5"""
6This is the ``docutils.parsers.rst.states`` module, the core of
7the reStructuredText parser. It defines the following:
8
9:Classes:
10 - `RSTStateMachine`: reStructuredText parser's entry point.
11 - `NestedStateMachine`: recursive StateMachine.
12 - `RSTState`: reStructuredText State superclass.
13 - `Inliner`: For parsing inline markup.
14 - `Body`: Generic classifier of the first line of a block.
15 - `SpecializedBody`: Superclass for compound element members.
16 - `BulletList`: Second and subsequent bullet_list list_items
17 - `DefinitionList`: Second+ definition_list_items.
18 - `EnumeratedList`: Second+ enumerated_list list_items.
19 - `FieldList`: Second+ fields.
20 - `OptionList`: Second+ option_list_items.
21 - `RFC2822List`: Second+ RFC2822-style fields.
22 - `ExtensionOptions`: Parses directive option fields.
23 - `Explicit`: Second+ explicit markup constructs.
24 - `SubstitutionDef`: For embedded directives in substitution definitions.
25 - `Text`: Classifier of second line of a text block.
26 - `SpecializedText`: Superclass for continuation lines of Text-variants.
27 - `Definition`: Second line of potential definition_list_item.
28 - `Line`: Second line of overlined section title or transition marker.
29 - `Struct`: obsolete, use `types.SimpleNamespace`.
30
31:Exception classes:
32 - `MarkupError`
33 - `ParserError`
34 - `MarkupMismatch`
35
36:Functions:
37 - `escape2null()`: Return a string, escape-backslashes converted to nulls.
38 - `unescape()`: Return a string, nulls removed or restored to backslashes.
39
40:Attributes:
41 - `state_classes`: set of State classes used with `RSTStateMachine`.
42
43Parser Overview
44===============
45
46The reStructuredText parser is implemented as a recursive state machine,
47examining its input one line at a time. To understand how the parser works,
48please first become familiar with the `docutils.statemachine` module. In the
49description below, references are made to classes defined in this module;
50please see the individual classes for details.
51
52Parsing proceeds as follows:
53
541. The state machine examines each line of input, checking each of the
55 transition patterns of the state `Body`, in order, looking for a match.
56 The implicit transitions (blank lines and indentation) are checked before
57 any others. The 'text' transition is a catch-all (matches anything).
58
592. The method associated with the matched transition pattern is called.
60
61 A. Some transition methods are self-contained, appending elements to the
62 document tree (`Body.doctest` parses a doctest block). The parser's
63 current line index is advanced to the end of the element, and parsing
64 continues with step 1.
65
66 B. Other transition methods trigger the creation of a nested state machine,
67 whose job is to parse a compound construct ('indent' does a block quote,
68 'bullet' does a bullet list, 'overline' does a section [first checking
69 for a valid section header], etc.).
70
71 - In the case of lists and explicit markup, a one-off state machine is
72 created and run to parse contents of the first item.
73
74 - A new state machine is created and its initial state is set to the
75 appropriate specialized state (`BulletList` in the case of the
76 'bullet' transition; see `SpecializedBody` for more detail). This
77 state machine is run to parse the compound element (or series of
78 explicit markup elements), and returns as soon as a non-member element
79 is encountered. For example, the `BulletList` state machine ends as
80 soon as it encounters an element which is not a list item of that
81 bullet list. The optional omission of inter-element blank lines is
82 enabled by this nested state machine.
83
84 - The current line index is advanced to the end of the elements parsed,
85 and parsing continues with step 1.
86
87 C. The result of the 'text' transition depends on the next line of text.
88 The current state is changed to `Text`, under which the second line is
89 examined. If the second line is:
90
91 - Indented: The element is a definition list item, and parsing proceeds
92 similarly to step 2.B, using the `DefinitionList` state.
93
94 - A line of uniform punctuation characters: The element is a section
95 header; again, parsing proceeds as in step 2.B, and `Body` is still
96 used.
97
98 - Anything else: The element is a paragraph, which is examined for
99 inline markup and appended to the parent element. Processing
100 continues with step 1.
101"""
102
103from __future__ import annotations
104
105__docformat__ = 'reStructuredText'
106
107import re
108from types import FunctionType, MethodType
109from types import SimpleNamespace as Struct
110import warnings
111
112from docutils import nodes, statemachine, utils
113from docutils import ApplicationError, DataError
114from docutils.statemachine import StateMachineWS, StateWS
115from docutils.nodes import fully_normalize_name as normalize_name
116from docutils.nodes import unescape, whitespace_normalize_name
117import docutils.parsers.rst
118from docutils.parsers.rst import directives, languages, tableparser, roles
119from docutils.utils import escape2null, column_width, strip_combining_chars
120from docutils.utils import punctuation_chars, urischemes
121from docutils.utils import split_escaped_whitespace
122from docutils.utils._roman_numerals import (InvalidRomanNumeralError,
123 RomanNumeral)
124
# Lightweight stand-in for `typing.TYPE_CHECKING`: always False at runtime,
# so the guarded import below is seen by type checkers only.
TYPE_CHECKING = False
if TYPE_CHECKING:
    # Needed only for the `StringList` annotation in `RSTState.nested_parse()`.
    from docutils.statemachine import StringList
128
129
# Parser exception classes.  The bases `DataError` and `ApplicationError`
# come from the top-level `docutils` package (see imports above);
# `MarkupMismatch` is a plain `Exception` used for internal signalling.
class MarkupError(DataError): pass
class UnknownInterpretedRoleError(DataError): pass
class InterpretedRoleNotImplementedError(DataError): pass
class ParserError(ApplicationError): pass
class MarkupMismatch(Exception): pass
135
136
class RSTStateMachine(StateMachineWS):

    """
    reStructuredText's master StateMachine.

    The entry point to reStructuredText parsing is the `run()` method.
    """

    section_level_offset: int = 0
    """Correction term for section level determination in nested parsing.

    Updated by `RSTState.nested_parse()` and used in
    `RSTState.check_subsection()` to compensate differences when
    nested parsing uses a detached base node with a document-wide
    section title style hierarchy or the current node with a new,
    independent title style hierarchy.
    """

    def run(self, input_lines, document, input_offset=0, match_titles=True,
            inliner=None) -> None:
        """
        Parse `input_lines` and modify the `document` node in place.

        Extend `StateMachineWS.run()`: set up parse-global data and
        run the StateMachine.
        """
        settings = document.settings
        self.language = languages.get_language(settings.language_code,
                                               document.reporter)
        self.match_titles = match_titles
        if inliner is None:
            inliner = Inliner()
        inliner.init_customizations(settings)
        # Parse-global data shared with nested parsers.  The attributes
        # `reporter`, `section_level`, and `section_bubble_up_kludge`
        # are obsolete and will be removed in Docutils 2.0.
        self.memo = Struct(document=document,
                           reporter=document.reporter,  # ignored
                           language=self.language,
                           title_styles=[],
                           section_level=0,  # ignored
                           section_bubble_up_kludge=False,  # ignored
                           inliner=inliner)
        self.document = document
        self.attach_observer(document.note_source)
        self.reporter = document.reporter
        self.node = document
        results = StateMachineWS.run(self, input_lines, input_offset,
                                     input_source=document['source'])
        assert results == [], 'RSTStateMachine.run() results should be empty!'
        # Drop references that are no longer needed.
        self.node = self.memo = None
186
187
class NestedStateMachine(RSTStateMachine):
    """
    StateMachine run from within other StateMachine runs, to parse nested
    document structures.
    """

    def __init__(self, state_classes, initial_state,
                 debug=False, parent_state_machine=None) -> None:
        # Handle on the machine that spawned this one (if any), so that
        # changes to the "current node" can be propagated upwards.
        self.parent_state_machine = parent_state_machine
        super().__init__(state_classes, initial_state, debug)

    def run(self, input_lines, input_offset, memo, node, match_titles=True):
        """
        Parse `input_lines` and populate `node`.

        Extend `StateMachineWS.run()`: share the parse-global data held
        in `memo` instead of setting it up anew.
        """
        self.match_titles = match_titles
        self.memo = memo
        document = memo.document
        self.document = document
        self.attach_observer(document.note_source)
        self.language = memo.language
        self.reporter = document.reporter
        self.node = node
        results = StateMachineWS.run(self, input_lines, input_offset)
        assert results == [], ('NestedStateMachine.run() results should be '
                               'empty!')
        return results
219
220
class RSTState(StateWS):

    """
    reStructuredText State superclass.

    Contains methods used by all State subclasses.
    """

    # State machine class used for nested parsing (see `nested_parse()`).
    nested_sm = NestedStateMachine
    # Class-level (shared) cache of fully-default nested state machines,
    # so they can be reused instead of rebuilt for every nested parse.
    nested_sm_cache = []

    def __init__(self, state_machine: RSTStateMachine, debug=False) -> None:
        # Default keyword arguments for nested state-machine creation.
        self.nested_sm_kwargs = {'state_classes': state_classes,
                                 'initial_state': 'Body'}
        StateWS.__init__(self, state_machine, debug)

    def runtime_init(self) -> None:
        # Fetch parse-global data from the state machine's `memo`.
        StateWS.runtime_init(self)
        memo = self.state_machine.memo
        self.memo = memo
        self.document = memo.document
        self.inliner = memo.inliner
        self.reporter = self.document.reporter
        # enable the reporter to determine source and source-line
        if not hasattr(self.reporter, 'get_source_and_line'):
            self.reporter.get_source_and_line = self.state_machine.get_source_and_line  # noqa:E501

    @property
    def parent(self) -> nodes.Element | None:
        # The "current node": new elements are appended to this node.
        return self.state_machine.node

    @parent.setter
    def parent(self, value: nodes.Element) -> None:
        self.state_machine.node = value

    def goto_line(self, abs_line_offset) -> None:
        """
        Jump to input line `abs_line_offset`, ignoring jumps past the end.
        """
        try:
            self.state_machine.goto_line(abs_line_offset)
        except EOFError:
            pass

    def no_match(self, context, transitions):
        """
        Override `StateWS.no_match` to generate a system message.

        This code should never be run.
        """
        self.reporter.severe(
            'Internal error: no transition pattern match. State: "%s"; '
            'transitions: %s; context: %s; current line: %r.'
            % (self.__class__.__name__, transitions, context,
               self.state_machine.line))
        return context, None, []

    def bof(self, context):
        """Called at beginning of file."""
        return [], []

    def nested_parse(self,
                     block: StringList,
                     input_offset: int,
                     node: nodes.Element|None = None,
                     match_titles: bool = False,
                     state_machine_class: StateMachineWS|None = None,
                     state_machine_kwargs: dict|None = None
                     ) -> int:
        """
        Parse the input `block` with a nested state-machine rooted at `node`.

        :block:
            reStructuredText source extract.
        :input_offset:
            Line number at start of the block.
        :node:
            Base node. Generated nodes will be appended to this node.
            Default: the "current node" (`self.state_machine.node`).
        :match_titles:
            Allow section titles?
            Caution: With a custom base node, this may lead to an invalid
            or mixed up document tree. [#]_
        :state_machine_class:
            Default: `NestedStateMachine`.
        :state_machine_kwargs:
            Keyword arguments for the state-machine instantiation.
            Default: `self.nested_sm_kwargs`.

        Create a new state-machine instance if required.
        Return new offset.

        .. [#] See also ``test_parsers/test_rst/test_nested_parsing.py``
           and Sphinx's `nested_parse_to_nodes()`__.

           __ https://www.sphinx-doc.org/en/master/extdev/utils.html
              #sphinx.util.parsing.nested_parse_to_nodes
        """
        if node is None:
            node = self.state_machine.node
        # Only machines with both defaults are eligible for the cache below.
        use_default = 0
        if state_machine_class is None:
            state_machine_class = self.nested_sm
            use_default += 1
        if state_machine_kwargs is None:
            state_machine_kwargs = self.nested_sm_kwargs
            use_default += 1
        my_state_machine = None
        if use_default == 2:
            try:
                # get cached state machine, prevent others from using it
                my_state_machine = self.nested_sm_cache.pop()
            except IndexError:
                pass
        if not my_state_machine:
            my_state_machine = state_machine_class(
                debug=self.debug,
                parent_state_machine=self.state_machine,
                **state_machine_kwargs)
        # Check if we may use sections (with a caveat for custom nodes
        # that may be dummies to collect children):
        if (node == self.state_machine.node
            and not isinstance(node, (nodes.document, nodes.section))):
            match_titles = False  # avoid invalid sections
        if match_titles:
            # Compensate mismatch of known title styles and number of
            # parent sections of the base node if the document wide
            # title styles are used with a detached base node or
            # a new list of title styles with the current parent node:
            l_node = len(node.section_hierarchy())
            l_start = min(len(self.parent.section_hierarchy()),
                          len(self.memo.title_styles))
            my_state_machine.section_level_offset = l_start - l_node

        # run the state machine and populate `node`:
        block_length = len(block)
        my_state_machine.run(block, input_offset, self.memo,
                             node, match_titles)

        if match_titles:
            if node == self.state_machine.node:
                # Pass on the new "current node" to parent state machines:
                sm = self.state_machine
                try:
                    # The chain ends when `parent_state_machine` is None
                    # (attribute access on None raises AttributeError).
                    while True:
                        sm.node = my_state_machine.node
                        sm = sm.parent_state_machine
                except AttributeError:
                    pass
        # clean up
        new_offset = my_state_machine.abs_line_offset()
        if use_default == 2:
            self.nested_sm_cache.append(my_state_machine)
        else:
            my_state_machine.unlink()
        # No `block.parent` implies disconnected -- lines aren't in sync:
        if block.parent and (len(block) - block_length) != 0:
            # Adjustment for block if modified in nested parse:
            self.state_machine.next_line(len(block) - block_length)
        return new_offset

    def nested_list_parse(self, block, input_offset, node, initial_state,
                          blank_finish,
                          blank_finish_state=None,
                          extra_settings={},
                          match_titles=False,  # deprecated, will be removed
                          state_machine_class=None,
                          state_machine_kwargs=None):
        """
        Parse the input `block` with a nested state-machine rooted at `node`.

        Create a new StateMachine rooted at `node` and run it over the
        input `block` (see also `nested_parse()`).
        Also keep track of optional intermediate blank lines and the
        required final one.

        Return new offset and a boolean indicating whether there was a
        blank final line.
        """
        # NOTE: `extra_settings` uses a mutable default, but it is only
        # ever read here (never mutated), so the instances stay safe.
        if match_titles:
            warnings.warn('The "match_titles" argument of '
                          'parsers.rst.states.RSTState.nested_list_parse() '
                          'will be ignored in Docutils 1.0 '
                          'and removed in Docutils 2.0.',
                          PendingDeprecationWarning, stacklevel=2)
        if state_machine_class is None:
            state_machine_class = self.nested_sm
        if state_machine_kwargs is None:
            state_machine_kwargs = self.nested_sm_kwargs.copy()
        state_machine_kwargs['initial_state'] = initial_state
        my_state_machine = state_machine_class(
            debug=self.debug,
            parent_state_machine=self.state_machine,
            **state_machine_kwargs)
        if blank_finish_state is None:
            blank_finish_state = initial_state
        my_state_machine.states[blank_finish_state].blank_finish = blank_finish
        for key, value in extra_settings.items():
            setattr(my_state_machine.states[initial_state], key, value)
        my_state_machine.run(block, input_offset, memo=self.memo,
                             node=node, match_titles=match_titles)
        blank_finish = my_state_machine.states[blank_finish_state].blank_finish
        my_state_machine.unlink()
        return my_state_machine.abs_line_offset(), blank_finish

    def section(self, title, source, style, lineno, messages) -> None:
        """Check for a valid subsection and create one if it checks out."""
        if self.check_subsection(source, style, lineno):
            self.new_subsection(title, lineno, messages)

    def check_subsection(self, source, style, lineno) -> bool:
        """
        Check for a valid subsection header. Update section data in `memo`.

        When a new section is reached that isn't a subsection of the current
        section, set `self.parent` to the new section's parent section
        (or the root node if the new section is a top-level section).
        """
        title_styles = self.memo.title_styles
        parent_sections = self.parent.section_hierarchy()
        # current section level: (0 root, 1 section, 2 subsection, ...)
        oldlevel = (len(parent_sections)
                    + self.state_machine.section_level_offset)
        # new section level:
        try:  # check for existing title style
            newlevel = title_styles.index(style) + 1
        except ValueError:  # new title style
            newlevel = len(title_styles) + 1
        # The new level must not be deeper than an immediate child
        # of the current level:
        if newlevel > oldlevel + 1:
            styles = ' '.join('/'.join(style) for style in title_styles)
            self.parent += self.reporter.error(
                'Inconsistent title style:'
                f' skip from level {oldlevel} to {newlevel}.',
                nodes.literal_block('', source),
                nodes.paragraph('', f'Established title styles: {styles}'),
                line=lineno)
            return False
        if newlevel <= oldlevel:
            # new section is sibling or higher up in the section hierarchy
            try:
                new_parent = parent_sections[newlevel-oldlevel-1].parent
            except IndexError:
                styles = ' '.join('/'.join(style) for style in title_styles)
                details = (f'The parent of level {newlevel} sections cannot'
                           ' be reached. The parser is at section level'
                           f' {oldlevel} but the current node has only'
                           f' {len(parent_sections)} parent section(s).'
                           '\nOne reason may be a high level'
                           ' section used in a directive that parses its'
                           ' content into a base node not attached to'
                           ' the document\n(up to Docutils 0.21,'
                           ' these sections were silently dropped).')
                self.parent += self.reporter.error(
                    f'A level {newlevel} section cannot be used here.',
                    nodes.literal_block('', source),
                    nodes.paragraph('', f'Established title styles: {styles}'),
                    nodes.paragraph('', details),
                    line=lineno)
                return False
            self.parent = new_parent
        self.memo.section_level = newlevel - 1
        if newlevel > len(title_styles):
            title_styles.append(style)
        return True

    def title_inconsistent(self, sourcetext, lineno):
        # Ignored. Will be removed in Docutils 2.0.
        error = self.reporter.error(
            'Title level inconsistent:', nodes.literal_block('', sourcetext),
            line=lineno)
        return error

    def new_subsection(self, title, lineno, messages):
        """Append new subsection to document tree."""
        section_node = nodes.section()
        self.parent += section_node
        textnodes, title_messages = self.inline_text(title, lineno)
        titlenode = nodes.title(title, '', *textnodes)
        name = normalize_name(titlenode.astext())
        section_node['names'].append(name)
        section_node += titlenode
        section_node += messages
        section_node += title_messages
        self.document.note_implicit_target(section_node, section_node)
        # Update state:
        self.parent = section_node
        self.memo.section_level += 1

    def paragraph(self, lines, lineno):
        """
        Return a list (paragraph & messages) & a boolean: literal_block next?
        """
        data = '\n'.join(lines).rstrip()
        # An even number of backslashes before a trailing "::" means the
        # "::" is unescaped: a literal block follows.
        if re.search(r'(?<!\\)(\\\\)*::$', data):
            if len(data) == 2:
                # Paragraph is just "::": no paragraph, only the signal.
                return [], 1
            elif data[-3] in ' \n':
                # Whitespace before "::": drop the marker entirely.
                text = data[:-3].rstrip()
            else:
                # Attached "::" is shortened to a single ":".
                text = data[:-1]
            literalnext = 1
        else:
            text = data
            literalnext = 0
        textnodes, messages = self.inline_text(text, lineno)
        p = nodes.paragraph(data, '', *textnodes)
        p.source, p.line = self.state_machine.get_source_and_line(lineno)
        return [p] + messages, literalnext

    def inline_text(self, text, lineno):
        """
        Return 2 lists: nodes (text and inline elements), and system_messages.
        """
        # (the local name `nodes` shadows the `docutils.nodes` module here)
        nodes, messages = self.inliner.parse(text, lineno,
                                             self.memo, self.parent)
        return nodes, messages

    def unindent_warning(self, node_name):
        # the actual problem is one line below the current line
        lineno = self.state_machine.abs_line_number() + 1
        return self.reporter.warning('%s ends without a blank line; '
                                     'unexpected unindent.' % node_name,
                                     line=lineno)
546
547
def build_regexp(definition, compile_patterns=True):
    """
    Build, compile and return a regular expression based on `definition`.

    :Parameter: `definition`: a 4-tuple (group name, prefix, suffix, parts),
        where "parts" is a list of regular expressions and/or regular
        expression definitions to be joined into an or-group.
    """
    name, prefix, suffix, parts = definition
    # Render nested definitions recursively (uncompiled); pass plain
    # pattern strings through unchanged.
    rendered = [build_regexp(part, None) if isinstance(part, tuple) else part
                for part in parts]
    regexp = f"{prefix}(?P<{name}>{'|'.join(rendered)}){suffix}"
    if compile_patterns:
        return re.compile(regexp)
    return regexp
569
570
571class Inliner:
572
573 """
574 Parse inline markup; call the `parse()` method.
575 """
576
577 def __init__(self) -> None:
578 self.implicit_dispatch = []
579 """List of (pattern, bound method) tuples, used by
580 `self.implicit_inline`."""
581
    def init_customizations(self, settings) -> None:
        """Compile inline-markup recognition patterns based on `settings`."""
        # lookahead and look-behind expressions for inline markup rules
        if getattr(settings, 'character_level_inline_markup', False):
            start_string_prefix = '(^|(?<!\x00))'
            end_string_suffix = ''
        else:
            start_string_prefix = ('(^|(?<=\\s|[%s%s]))' %
                                   (punctuation_chars.openers,
                                    punctuation_chars.delimiters))
            end_string_suffix = ('($|(?=\\s|[\x00%s%s%s]))' %
                                 (punctuation_chars.closing_delimiters,
                                  punctuation_chars.delimiters,
                                  punctuation_chars.closers))
        # `args` collects the local and class-level pattern fragments for
        # %-interpolation into the verbose regexps below.
        args = locals().copy()
        args.update(vars(self.__class__))

        parts = ('initial_inline', start_string_prefix, '',
                 [
                  ('start', '', self.non_whitespace_after,  # simple start-strings
                   [r'\*\*',                # strong
                    r'\*(?!\*)',            # emphasis but not strong
                    r'``',                  # literal
                    r'_`',                  # inline internal target
                    r'\|(?!\|)']            # substitution reference
                   ),
                  ('whole', '', end_string_suffix,  # whole constructs
                   [  # reference name & end-string
                    r'(?P<refname>%s)(?P<refend>__?)' % self.simplename,
                    ('footnotelabel', r'\[', r'(?P<fnend>\]_)',
                     [r'[0-9]+',               # manually numbered
                      r'\#(%s)?' % self.simplename,  # auto-numbered (w/ label?)
                      r'\*',                   # auto-symbol
                      r'(?P<citationlabel>%s)' % self.simplename,  # citation ref
                      ]
                     )
                    ]
                   ),
                  ('backquote',             # interpreted text or phrase reference
                   '(?P<role>(:%s:)?)' % self.simplename,  # optional role
                   self.non_whitespace_after,
                   ['`(?!`)']               # but not literal
                   )
                  ]
                 )
        self.start_string_prefix = start_string_prefix
        self.end_string_suffix = end_string_suffix
        self.parts = parts

        # Compiled recognition patterns, built via `build_regexp()` and
        # the fragments gathered in `args`.
        self.patterns = Struct(
            initial=build_regexp(parts),
            emphasis=re.compile(self.non_whitespace_escape_before
                                + r'(\*)' + end_string_suffix),
            strong=re.compile(self.non_whitespace_escape_before
                              + r'(\*\*)' + end_string_suffix),
            interpreted_or_phrase_ref=re.compile(
                r"""
                %(non_unescaped_whitespace_escape_before)s
                (
                  `
                  (?P<suffix>
                    (?P<role>:%(simplename)s:)?
                    (?P<refend>__?)?
                  )
                )
                %(end_string_suffix)s
                """ % args, re.VERBOSE),
            embedded_link=re.compile(
                r"""
                (
                  (?:[ \n]+|^)            # spaces or beginning of line/string
                  <                       # open bracket
                  %(non_whitespace_after)s
                  (([^<>]|\x00[<>])+)     # anything but unescaped angle brackets
                  %(non_whitespace_escape_before)s
                  >                       # close bracket
                )
                $                         # end of string
                """ % args, re.VERBOSE),
            literal=re.compile(self.non_whitespace_before + '(``)'
                               + end_string_suffix),
            target=re.compile(self.non_whitespace_escape_before
                              + r'(`)' + end_string_suffix),
            substitution_ref=re.compile(self.non_whitespace_escape_before
                                        + r'(\|_{0,2})'
                                        + end_string_suffix),
            email=re.compile(self.email_pattern % args + '$',
                             re.VERBOSE),
            uri=re.compile(
                (r"""
                %(start_string_prefix)s
                (?P<whole>
                  (?P<absolute>           # absolute URI
                    (?P<scheme>             # scheme (http, ftp, mailto)
                      [a-zA-Z][a-zA-Z0-9.+-]*
                    )
                    :
                    (
                      (                       # either:
                        (//?)?                  # hierarchical URI
                        %(uric)s*               # URI characters
                        %(uri_end)s             # final URI char
                      )
                      (                       # optional query
                        \?%(uric)s*
                        %(uri_end)s
                      )?
                      (                       # optional fragment
                        \#%(uric)s*
                        %(uri_end)s
                      )?
                    )
                  )
                |                       # *OR*
                  (?P<email>              # email address
                   """ + self.email_pattern + r"""
                  )
                )
                %(end_string_suffix)s
                """) % args, re.VERBOSE),
            pep=re.compile(
                r"""
                %(start_string_prefix)s
                (
                  (pep-(?P<pepnum1>\d+)(.txt)?) # reference to source file
                |
                  (PEP\s+(?P<pepnum2>\d+))      # reference by name
                )
                %(end_string_suffix)s""" % args, re.VERBOSE),
            rfc=re.compile(
                r"""
                %(start_string_prefix)s
                (RFC(-|\s+)?(?P<rfcnum>\d+))
                %(end_string_suffix)s""" % args, re.VERBOSE))

        # Implicit (pattern, handler) pairs; PEP/RFC recognition is
        # enabled by the corresponding settings only.
        self.implicit_dispatch.append((self.patterns.uri,
                                       self.standalone_uri))
        if settings.pep_references:
            self.implicit_dispatch.append((self.patterns.pep,
                                           self.pep_reference))
        if settings.rfc_references:
            self.implicit_dispatch.append((self.patterns.rfc,
                                           self.rfc_reference))
724
    def parse(self, text, lineno, memo, parent):
        # Needs to be refactored for nested inline markup.
        # Add nested_parse() method?
        """
        Return 2 lists: nodes (text and inline elements), and system_messages.

        Using `self.patterns.initial`, a pattern which matches start-strings
        (emphasis, strong, interpreted, phrase reference, literal,
        substitution reference, and inline target) and complete constructs
        (simple reference, footnote reference), search for a candidate. When
        one is found, check for validity (e.g., not a quoted '*' character).
        If valid, search for the corresponding end string if applicable, and
        check it for validity. If not found or invalid, generate a warning
        and ignore the start-string. Implicit inline markup (e.g. standalone
        URIs) is found last.

        :text: source string
        :lineno: absolute line number, cf. `statemachine.get_source_and_line()`
        """
        self.document = memo.document
        self.language = memo.language
        self.reporter = self.document.reporter
        self.parent = parent
        pattern_search = self.patterns.initial.search
        dispatch = self.dispatch
        remaining = escape2null(text)
        processed = []
        unprocessed = []
        messages = []
        while remaining:
            match = pattern_search(remaining)
            if match:
                groups = match.groupdict()
                # `dispatch` maps the matched start-string to a handler;
                # note the explicit `self` argument in the call below.
                method = dispatch[groups['start'] or groups['backquote']
                                  or groups['refend'] or groups['fnend']]
                before, inlines, remaining, sysmessages = method(self, match,
                                                                 lineno)
                # `before` is plain text preceding the match; buffer it and
                # scan for implicit markup once real inline nodes appear.
                unprocessed.append(before)
                messages += sysmessages
                if inlines:
                    processed += self.implicit_inline(''.join(unprocessed),
                                                      lineno)
                    processed += inlines
                    unprocessed = []
            else:
                break
        # Whatever is left is plain text; scan it for implicit markup.
        remaining = ''.join(unprocessed) + remaining
        if remaining:
            processed += self.implicit_inline(remaining, lineno)
        return processed, messages
775
    # Inline object recognition
    # -------------------------
    # See also init_customizations().
    non_whitespace_before = r'(?<!\s)'
    non_whitespace_escape_before = r'(?<![\s\x00])'
    non_unescaped_whitespace_escape_before = r'(?<!(?<!\x00)[\s\x00])'
    non_whitespace_after = r'(?!\s)'
    # Alphanumerics with isolated internal [-._+:] chars (i.e. not 2 together):
    simplename = r'(?:(?!_)\w)+(?:[-._+:](?:(?!_)\w)+)*'
    # Valid URI characters (see RFC 2396 & RFC 2732);
    # final \x00 allows backslash escapes in URIs:
    uric = r"""[-_.!~*'()[\];/:@&=+$,%a-zA-Z0-9\x00]"""
    # Delimiter indicating the end of a URI (not part of the URI):
    uri_end_delim = r"""[>]"""
    # Last URI character; same as uric but no punctuation:
    urilast = r"""[_~*/=+a-zA-Z0-9]"""
    # End of a URI (either 'urilast' or 'uric followed by a
    # uri_end_delim'); interpolated at class-definition time via locals():
    uri_end = r"""(?:%(urilast)s|%(uric)s(?=%(uri_end_delim)s))""" % locals()
    emailc = r"""[-_!~*'{|}/#?^`&=+$%a-zA-Z0-9\x00]"""
    email_pattern = r"""
          %(emailc)s+(?:\.%(emailc)s+)*   # name
          (?<!\x00)@                      # at
          %(emailc)s+(?:\.%(emailc)s*)*   # host
          %(uri_end)s                     # final URI char
          """
802
803 def quoted_start(self, match):
804 """Test if inline markup start-string is 'quoted'.
805
806 'Quoted' in this context means the start-string is enclosed in a pair
807 of matching opening/closing delimiters (not necessarily quotes)
808 or at the end of the match.
809 """
810 string = match.string
811 start = match.start()
812 if start == 0: # start-string at beginning of text
813 return False
814 prestart = string[start - 1]
815 try:
816 poststart = string[match.end()]
817 except IndexError: # start-string at end of text
818 return True # not "quoted" but no markup start-string either
819 return punctuation_chars.match_chars(prestart, poststart)
820
    def inline_obj(self, match, lineno, end_pattern, nodeclass,
                   restore_backslashes=False):
        """
        Parse a simple inline construct (start-string through end-string).

        Return a 5-tuple: text before the start-string, list of generated
        nodes, remaining text after the construct, list of system messages,
        and the matched end-string (empty string on failure).
        """
        string = match.string
        matchstart = match.start('start')
        matchend = match.end('start')
        if self.quoted_start(match):
            # Quoted start-string: treat it as plain text.
            return string[:matchend], [], string[matchend:], [], ''
        endmatch = end_pattern.search(string[matchend:])
        if endmatch and endmatch.start(1):  # 1 or more chars
            text = endmatch.string[:endmatch.start(1)]
            if restore_backslashes:
                text = unescape(text, True)
            textend = matchend + endmatch.end(1)
            rawsource = unescape(string[matchstart:textend], True)
            node = nodeclass(rawsource, text)
            return (string[:matchstart], [node],
                    string[textend:], [], endmatch.group(1))
        # No end-string: warn and mark the start-string as problematic.
        msg = self.reporter.warning(
            'Inline %s start-string without end-string.'
            % nodeclass.__name__, line=lineno)
        text = unescape(string[matchstart:matchend], True)
        prb = self.problematic(text, text, msg)
        return string[:matchstart], [prb], string[matchend:], [msg], ''
844
845 def problematic(self, text, rawsource, message):
846 msgid = self.document.set_id(message, self.parent)
847 problematic = nodes.problematic(rawsource, text, refid=msgid)
848 prbid = self.document.set_id(problematic)
849 message.add_backref(prbid)
850 return problematic
851
852 def emphasis(self, match, lineno):
853 before, inlines, remaining, sysmessages, endstring = self.inline_obj(
854 match, lineno, self.patterns.emphasis, nodes.emphasis)
855 return before, inlines, remaining, sysmessages
856
857 def strong(self, match, lineno):
858 before, inlines, remaining, sysmessages, endstring = self.inline_obj(
859 match, lineno, self.patterns.strong, nodes.strong)
860 return before, inlines, remaining, sysmessages
861
    def interpreted_or_phrase_ref(self, match, lineno):
        """
        Parse a backquoted construct: interpreted text or phrase reference.

        Return a 4-tuple: text before the construct, list of generated
        nodes, remaining text, and list of system messages.
        """
        end_pattern = self.patterns.interpreted_or_phrase_ref
        string = match.string
        matchstart = match.start('backquote')
        matchend = match.end('backquote')
        rolestart = match.start('role')
        role = match.group('role')
        position = ''
        if role:
            # strip the surrounding colons from ":role:"
            role = role[1:-1]
            position = 'prefix'
        elif self.quoted_start(match):
            return string[:matchend], [], string[matchend:], []
        endmatch = end_pattern.search(string[matchend:])
        if endmatch and endmatch.start(1):  # 1 or more chars
            textend = matchend + endmatch.end()
            if endmatch.group('role'):
                if role:
                    # role given both before and after the construct
                    msg = self.reporter.warning(
                        'Multiple roles in interpreted text (both '
                        'prefix and suffix present; only one allowed).',
                        line=lineno)
                    text = unescape(string[rolestart:textend], True)
                    prb = self.problematic(text, text, msg)
                    return string[:rolestart], [prb], string[textend:], [msg]
                role = endmatch.group('suffix')[1:-1]
                position = 'suffix'
            escaped = endmatch.string[:endmatch.start(1)]
            rawsource = unescape(string[matchstart:textend], True)
            if rawsource[-1:] == '_':
                # trailing underscore: this is a (phrase) reference
                if role:
                    msg = self.reporter.warning(
                        'Mismatch: both interpreted text role %s and '
                        'reference suffix.' % position, line=lineno)
                    text = unescape(string[rolestart:textend], True)
                    prb = self.problematic(text, text, msg)
                    return string[:rolestart], [prb], string[textend:], [msg]
                return self.phrase_ref(string[:matchstart], string[textend:],
                                       rawsource, escaped)
            else:
                rawsource = unescape(string[rolestart:textend], True)
                nodelist, messages = self.interpreted(rawsource, escaped, role,
                                                      lineno)
                return (string[:rolestart], nodelist,
                        string[textend:], messages)
        # No end-string: warn and mark the start-string as problematic.
        msg = self.reporter.warning(
            'Inline interpreted text or phrase reference start-string '
            'without end-string.', line=lineno)
        text = unescape(string[matchstart:matchend], True)
        prb = self.problematic(text, text, msg)
        return string[:matchstart], [prb], string[matchend:], [msg]
913
    def phrase_ref(self, before, after, rawsource, escaped, text=None):
        """Build nodes for a phrase reference.

        Handles embedded URIs/aliases (``<...>``) and the anonymous
        (``__``) vs. named (``_``) suffix forms.
        Return (before, node list, after, []).
        """
        # `text` is ignored (since 0.16)
        match = self.patterns.embedded_link.search(escaped)
        if match:  # embedded <URI> or <alias_>
            text = escaped[:match.start(0)]
            unescaped = unescape(text)
            rawtext = unescape(text, True)
            aliastext = match.group(2)
            rawaliastext = unescape(aliastext, True)
            underscore_escaped = rawaliastext.endswith(r'\_')
            if (aliastext.endswith('_')
                and not (underscore_escaped
                         or self.patterns.uri.match(aliastext))):
                # Unescaped trailing '_' and not a URI: alias of a target.
                aliastype = 'name'
                alias = normalize_name(unescape(aliastext[:-1]))
                target = nodes.target(match.group(1), refname=alias)
                target.indirect_reference_name = whitespace_normalize_name(
                    unescape(aliastext[:-1]))
            else:
                aliastype = 'uri'
                # remove unescaped whitespace
                alias_parts = split_escaped_whitespace(match.group(2))
                alias = ' '.join(''.join(part.split())
                                 for part in alias_parts)
                alias = self.adjust_uri(unescape(alias))
                if alias.endswith(r'\_'):
                    alias = alias[:-2] + '_'
                target = nodes.target(match.group(1), refuri=alias)
                target.referenced = 1
            if not aliastext:
                raise ApplicationError('problem with embedded link: %r'
                                       % aliastext)
            if not text:
                # Nothing before the embedded link: use the alias as text.
                text = alias
                unescaped = unescape(text)
                rawtext = rawaliastext
        else:
            text = escaped
            unescaped = unescape(text)
            target = None
            rawtext = unescape(escaped, True)

        refname = normalize_name(unescaped)
        reference = nodes.reference(rawsource, text,
                                    name=whitespace_normalize_name(unescaped))
        reference[0].rawsource = rawtext

        node_list = [reference]

        if rawsource[-2:] == '__':
            # Anonymous reference form.
            if target and (aliastype == 'name'):
                reference['refname'] = alias
                self.document.note_refname(reference)
                # self.document.note_indirect_target(target) # required?
            elif target and (aliastype == 'uri'):
                reference['refuri'] = alias
            else:
                reference['anonymous'] = True
        else:
            if target:
                target['names'].append(refname)
                if aliastype == 'name':
                    reference['refname'] = alias
                    self.document.note_indirect_target(target)
                    self.document.note_refname(reference)
                else:
                    reference['refuri'] = alias
                    # target.note_referenced_by(name=refname)
                    self.document.note_implicit_target(target, self.parent)
                node_list.append(target)
            else:
                reference['refname'] = refname
                self.document.note_refname(reference)
        return before, node_list, after, []
988
989 def adjust_uri(self, uri):
990 match = self.patterns.email.match(uri)
991 if match:
992 return 'mailto:' + uri
993 else:
994 return uri
995
996 def interpreted(self, rawsource, text, role, lineno):
997 role_fn, messages = roles.role(role, self.language, lineno,
998 self.reporter)
999 if role_fn:
1000 nodes, messages2 = role_fn(role, rawsource, text, lineno, self)
1001 return nodes, messages + messages2
1002 else:
1003 msg = self.reporter.error(
1004 'Unknown interpreted text role "%s".' % role,
1005 line=lineno)
1006 return ([self.problematic(rawsource, rawsource, msg)],
1007 messages + [msg])
1008
1009 def literal(self, match, lineno):
1010 before, inlines, remaining, sysmessages, endstring = self.inline_obj(
1011 match, lineno, self.patterns.literal, nodes.literal,
1012 restore_backslashes=True)
1013 return before, inlines, remaining, sysmessages
1014
1015 def inline_internal_target(self, match, lineno):
1016 before, inlines, remaining, sysmessages, endstring = self.inline_obj(
1017 match, lineno, self.patterns.target, nodes.target)
1018 if inlines and isinstance(inlines[0], nodes.target):
1019 assert len(inlines) == 1
1020 target = inlines[0]
1021 name = normalize_name(target.astext())
1022 target['names'].append(name)
1023 self.document.note_explicit_target(target, self.parent)
1024 return before, inlines, remaining, sysmessages
1025
    def substitution_reference(self, match, lineno):
        """Parse a substitution reference; a trailing ``_``/``__`` also
        makes it a (named/anonymous) hyperlink reference."""
        before, inlines, remaining, sysmessages, endstring = self.inline_obj(
            match, lineno, self.patterns.substitution_ref,
            nodes.substitution_reference)
        if len(inlines) == 1:
            subref_node = inlines[0]
            if isinstance(subref_node, nodes.substitution_reference):
                subref_text = subref_node.astext()
                self.document.note_substitution_ref(subref_node, subref_text)
                if endstring[-1:] == '_':
                    # Wrap the substitution in a reference node.
                    reference_node = nodes.reference(
                        '|%s%s' % (subref_text, endstring), '')
                    if endstring[-2:] == '__':
                        reference_node['anonymous'] = True
                    else:
                        reference_node['refname'] = normalize_name(subref_text)
                        self.document.note_refname(reference_node)
                    reference_node += subref_node
                    inlines = [reference_node]
        return before, inlines, remaining, sysmessages
1046
    def footnote_reference(self, match, lineno):
        """
        Handles `nodes.footnote_reference` and `nodes.citation_reference`
        elements.
        """
        label = match.group('footnotelabel')
        refname = normalize_name(label)
        string = match.string
        before = string[:match.start('whole')]
        remaining = string[match.end('whole'):]
        if match.group('citationlabel'):
            refnode = nodes.citation_reference('[%s]_' % label,
                                               refname=refname)
            refnode += nodes.Text(label)
            self.document.note_citation_ref(refnode)
        else:
            refnode = nodes.footnote_reference('[%s]_' % label)
            if refname[0] == '#':
                # Auto-numbered footnote label ('#' or '#name').
                refname = refname[1:]
                refnode['auto'] = 1
                self.document.note_autofootnote_ref(refnode)
            elif refname == '*':
                # Auto-symbol footnote label.
                refname = ''
                refnode['auto'] = '*'
                self.document.note_symbol_footnote_ref(
                    refnode)
            else:
                refnode += nodes.Text(label)
            if refname:
                refnode['refname'] = refname
                self.document.note_footnote_ref(refnode)
        if utils.get_trim_footnote_ref_space(self.document.settings):
            before = before.rstrip()
        return before, [refnode], remaining, []
1081
1082 def reference(self, match, lineno, anonymous=False):
1083 referencename = match.group('refname')
1084 refname = normalize_name(referencename)
1085 referencenode = nodes.reference(
1086 referencename + match.group('refend'), referencename,
1087 name=whitespace_normalize_name(referencename))
1088 referencenode[0].rawsource = referencename
1089 if anonymous:
1090 referencenode['anonymous'] = True
1091 else:
1092 referencenode['refname'] = refname
1093 self.document.note_refname(referencenode)
1094 string = match.string
1095 matchstart = match.start('whole')
1096 matchend = match.end('whole')
1097 return string[:matchstart], [referencenode], string[matchend:], []
1098
    def anonymous_reference(self, match, lineno):
        """Parse an anonymous reference (delegates to `self.reference`)."""
        return self.reference(match, lineno, anonymous=True)
1101
    def standalone_uri(self, match, lineno):
        """Return a reference node for a standalone URI or email address.

        :Raises: `MarkupMismatch` if the URI scheme is not recognized.
        """
        if (not match.group('scheme')
                or match.group('scheme').lower() in urischemes.schemes):
            if match.group('email'):
                # Bare email address: needs an explicit scheme.
                addscheme = 'mailto:'
            else:
                addscheme = ''
            text = match.group('whole')
            refuri = addscheme + unescape(text)
            reference = nodes.reference(unescape(text, True), text,
                                        refuri=refuri)
            return [reference]
        else:  # not a valid scheme
            raise MarkupMismatch
1116
    def pep_reference(self, match, lineno):
        """Return a reference node for an implicit PEP reference.

        :Raises: `MarkupMismatch` for unrecognized matches.
        """
        text = match.group(0)
        if text.startswith('pep-'):
            pepnum = int(unescape(match.group('pepnum1')))
        elif text.startswith('PEP'):
            pepnum = int(unescape(match.group('pepnum2')))
        else:
            raise MarkupMismatch
        ref = (self.document.settings.pep_base_url
               + self.document.settings.pep_file_url_template % pepnum)
        return [nodes.reference(unescape(text, True), text, refuri=ref)]
1128
    # Filename template appended to ``settings.rfc_base_url``.
    rfc_url = 'rfc%d.html'

    def rfc_reference(self, match, lineno):
        """Return a reference node for an implicit RFC reference.

        :Raises: `MarkupMismatch` for unrecognized matches.
        """
        text = match.group(0)
        if text.startswith('RFC'):
            rfcnum = int(unescape(match.group('rfcnum')))
            ref = self.document.settings.rfc_base_url + self.rfc_url % rfcnum
        else:
            raise MarkupMismatch
        return [nodes.reference(unescape(text, True), text, refuri=ref)]
1139
1140 def implicit_inline(self, text, lineno):
1141 """
1142 Check each of the patterns in `self.implicit_dispatch` for a match,
1143 and dispatch to the stored method for the pattern. Recursively check
1144 the text before and after the match. Return a list of `nodes.Text`
1145 and inline element nodes.
1146 """
1147 if not text:
1148 return []
1149 for pattern, method in self.implicit_dispatch:
1150 match = pattern.search(text)
1151 if match:
1152 try:
1153 # Must recurse on strings before *and* after the match;
1154 # there may be multiple patterns.
1155 return (self.implicit_inline(text[:match.start()], lineno)
1156 + method(match, lineno)
1157 + self.implicit_inline(text[match.end():], lineno))
1158 except MarkupMismatch:
1159 pass
1160 return [nodes.Text(text)]
1161
    # Dispatch table: inline markup start-string (end-string for the
    # reference/footnote forms) -> parsing method defined above.
    dispatch = {'*': emphasis,
                '**': strong,
                '`': interpreted_or_phrase_ref,
                '``': literal,
                '_`': inline_internal_target,
                ']_': footnote_reference,
                '|': substitution_reference,
                '_': reference,
                '__': anonymous_reference}
1171
1172
def _loweralpha_to_int(s, _zero=(ord('a')-1)):
    """Return the ordinal of lowercase ASCII letter `s` ('a' -> 1)."""
    return ord(s) - _zero
1175
1176
def _upperalpha_to_int(s, _zero=(ord('A')-1)):
    """Return the ordinal of uppercase ASCII letter `s` ('A' -> 1)."""
    return ord(s) - _zero
1179
1180
class Body(RSTState):

    """
    Generic classifier of the first line of a block.
    """

    double_width_pad_char = tableparser.TableParser.double_width_pad_char
    """Padding character for East Asian double-width text."""

    enum = Struct()
    """Enumerated list parsing information."""

    # Enumerator formats: prefix/suffix strings and the slice
    # (start:end) that strips them from the matched enumerator text.
    enum.formatinfo = {
        'parens': Struct(prefix='(', suffix=')', start=1, end=-1),
        'rparen': Struct(prefix='', suffix=')', start=0, end=-1),
        'period': Struct(prefix='', suffix='.', start=0, end=-1)}
    enum.formats = enum.formatinfo.keys()
    enum.sequences = ['arabic', 'loweralpha', 'upperalpha',
                      'lowerroman', 'upperroman']  # ORDERED!
    enum.sequencepats = {'arabic': '[0-9]+',
                         'loweralpha': '[a-z]',
                         'upperalpha': '[A-Z]',
                         'lowerroman': '[ivxlcdm]+',
                         'upperroman': '[IVXLCDM]+'}
    # Converters from enumerator text to its ordinal value.
    enum.converters = {'arabic': int,
                       'loweralpha': _loweralpha_to_int,
                       'upperalpha': _upperalpha_to_int,
                       'lowerroman': RomanNumeral.from_string,
                       'upperroman': RomanNumeral.from_string}

    # Anchored regexp per sequence, used to classify enumerator text.
    enum.sequenceregexps = {}
    for sequence in enum.sequences:
        enum.sequenceregexps[sequence] = re.compile(
            enum.sequencepats[sequence] + '$')

    grid_table_top_pat = re.compile(r'\+-[-+]+-\+ *$')
    """Matches the top (& bottom) of a full table)."""

    simple_table_top_pat = re.compile('=+( +=+)+ *$')
    """Matches the top of a simple table."""

    simple_table_border_pat = re.compile('=+[ =]*$')
    """Matches the bottom & header bottom of a simple table."""

    pats = {}
    """Fragments of patterns used by transitions."""

    pats['nonalphanum7bit'] = '[!-/:-@[-`{-~]'
    pats['alpha'] = '[a-zA-Z]'
    pats['alphanum'] = '[a-zA-Z0-9]'
    pats['alphanumplus'] = '[a-zA-Z0-9_-]'
    pats['enum'] = ('(%(arabic)s|%(loweralpha)s|%(upperalpha)s|%(lowerroman)s'
                    '|%(upperroman)s|#)' % enum.sequencepats)
    pats['optname'] = '%(alphanum)s%(alphanumplus)s*' % pats
    # @@@ Loosen up the pattern?  Allow Unicode?
    pats['optarg'] = '(%(alpha)s%(alphanumplus)s*|<[^<>]+>)' % pats
    pats['shortopt'] = r'(-|\+)%(alphanum)s( ?%(optarg)s)?' % pats
    pats['longopt'] = r'(--|/)%(optname)s([ =]%(optarg)s)?' % pats
    pats['option'] = r'(%(shortopt)s|%(longopt)s)' % pats

    # One named group per enumerator format, e.g. '(?P<parens>\(...\))'.
    for format in enum.formats:
        pats[format] = '(?P<%s>%s%s%s)' % (
            format, re.escape(enum.formatinfo[format].prefix),
            pats['enum'], re.escape(enum.formatinfo[format].suffix))

    # Transition patterns; keys correspond to `initial_transitions` below.
    patterns = {
        'bullet': '[-+*\u2022\u2023\u2043]( +|$)',
        'enumerator': r'(%(parens)s|%(rparen)s|%(period)s)( +|$)' % pats,
        'field_marker': r':(?![: ])([^:\\]|\\.|:(?!([ `]|$)))*(?<! ):( +|$)',
        'option_marker': r'%(option)s(, %(option)s)*( +| ?$)' % pats,
        'doctest': r'>>>( +|$)',
        'line_block': r'\|( +|$)',
        'grid_table_top': grid_table_top_pat,
        'simple_table_top': simple_table_top_pat,
        'explicit_markup': r'\.\.( +|$)',
        'anonymous': r'__( +|$)',
        'line': r'(%(nonalphanum7bit)s)\1* *$' % pats,
        'text': r''}
    initial_transitions = (
        'bullet',
        'enumerator',
        'field_marker',
        'option_marker',
        'doctest',
        'line_block',
        'grid_table_top',
        'simple_table_top',
        'explicit_markup',
        'anonymous',
        'line',
        'text')
1272
1273 def indent(self, match, context, next_state):
1274 """Block quote."""
1275 (indented, indent, line_offset, blank_finish
1276 ) = self.state_machine.get_indented()
1277 elements = self.block_quote(indented, line_offset)
1278 self.parent += elements
1279 if not blank_finish:
1280 self.parent += self.unindent_warning('Block quote')
1281 return context, next_state, []
1282
    def block_quote(self, indented, line_offset):
        """Return a list of block_quote nodes (and attribution messages).

        One indented block may contain several block quotes separated
        by attributions.
        """
        elements = []
        while indented:
            blockquote = nodes.block_quote(rawsource='\n'.join(indented))
            (blockquote.source, blockquote.line
             ) = self.state_machine.get_source_and_line(line_offset+1)
            # `indented` is re-bound to the lines after the attribution.
            (blockquote_lines,
             attribution_lines,
             attribution_offset,
             indented,
             new_line_offset) = self.split_attribution(indented, line_offset)
            self.nested_parse(blockquote_lines, line_offset, blockquote)
            elements.append(blockquote)
            if attribution_lines:
                attribution, messages = self.parse_attribution(
                    attribution_lines, line_offset+attribution_offset)
                blockquote += attribution
                elements += messages
            line_offset = new_line_offset
            # Skip blank lines before any following block quote.
            while indented and not indented[0]:
                indented = indented[1:]
                line_offset += 1
        return elements
1306
    # U+2014 is an em-dash:
    attribution_pattern = re.compile('(---?(?!-)|\u2014) *(?=[^ \\n])')

    def split_attribution(self, indented, line_offset):
        """
        Check for a block quote attribution and split it off:

        * First line after a blank line must begin with a dash ("--", "---",
          em-dash; matches `self.attribution_pattern`).
        * Every line after that must have consistent indentation.
        * Attributions must be preceded by block quote content.

        Return a tuple of: (block quote content lines, attribution lines,
        attribution offset, remaining indented lines, remaining lines offset).
        """
        blank = None  # index of the most recent blank line
        nonblank_seen = False
        for i in range(len(indented)):
            line = indented[i].rstrip()
            if line:
                if nonblank_seen and blank == i - 1:  # last line blank
                    match = self.attribution_pattern.match(line)
                    if match:
                        attribution_end, indent = self.check_attribution(
                            indented, i)
                        if attribution_end:
                            a_lines = indented[i:attribution_end]
                            # Strip the dash from the first line and the
                            # hanging indent from the following lines.
                            a_lines.trim_left(match.end(), end=1)
                            a_lines.trim_left(indent, start=1)
                            return (indented[:i], a_lines,
                                    i, indented[attribution_end:],
                                    line_offset + attribution_end)
                nonblank_seen = True
            else:
                blank = i
        else:
            # No attribution found: everything is block quote content.
            return indented, None, None, None, None
1344
    def check_attribution(self, indented, attribution_start):
        """
        Check attribution shape.
        Return the index past the end of the attribution, and the indent.
        """
        indent = None
        # Fallback value in case the range below is empty (attribution on
        # the last line); the for-else then advances it past the end.
        i = attribution_start + 1
        for i in range(attribution_start + 1, len(indented)):
            line = indented[i].rstrip()
            if not line:
                # Blank line terminates the attribution; `i` already
                # indexes the line after its last line.
                break
            if indent is None:
                indent = len(line) - len(line.lstrip())
            elif len(line) - len(line.lstrip()) != indent:
                return None, None  # bad shape; not an attribution
        else:
            # return index of line after last attribution line:
            i += 1
        return i, (indent or 0)
1364
1365 def parse_attribution(self, indented, line_offset):
1366 text = '\n'.join(indented).rstrip()
1367 lineno = 1 + line_offset # line_offset is zero-based
1368 textnodes, messages = self.inline_text(text, lineno)
1369 node = nodes.attribution(text, '', *textnodes)
1370 node.source, node.line = self.state_machine.get_source_and_line(lineno)
1371 return node, messages
1372
1373 def bullet(self, match, context, next_state):
1374 """Bullet list item."""
1375 ul = nodes.bullet_list()
1376 ul.source, ul.line = self.state_machine.get_source_and_line()
1377 self.parent += ul
1378 ul['bullet'] = match.string[0]
1379 i, blank_finish = self.list_item(match.end())
1380 ul += i
1381 offset = self.state_machine.line_offset + 1 # next line
1382 new_line_offset, blank_finish = self.nested_list_parse(
1383 self.state_machine.input_lines[offset:],
1384 input_offset=self.state_machine.abs_line_offset() + 1,
1385 node=ul, initial_state='BulletList',
1386 blank_finish=blank_finish)
1387 self.goto_line(new_line_offset)
1388 if not blank_finish:
1389 self.parent += self.unindent_warning('Bullet list')
1390 return [], next_state, []
1391
    def list_item(self, indent):
        """Return a list_item node parsed from the current position, and
        whether it ends with a blank line."""
        src, srcline = self.state_machine.get_source_and_line()
        if self.state_machine.line[indent:]:
            # Text on the same line as the marker: indent is known.
            indented, line_offset, blank_finish = (
                self.state_machine.get_known_indented(indent))
        else:
            # Marker alone on its line: indent comes from the next line.
            indented, indent, line_offset, blank_finish = (
                self.state_machine.get_first_known_indented(indent))
        listitem = nodes.list_item('\n'.join(indented))
        listitem.source, listitem.line = src, srcline
        if indented:
            self.nested_parse(indented, input_offset=line_offset,
                              node=listitem)
        return listitem, blank_finish
1406
    def enumerator(self, match, context, next_state):
        """Enumerated List Item"""
        format, sequence, text, ordinal = self.parse_enumerator(match)
        if not self.is_enumerated_list_item(ordinal, sequence, format):
            # Not a valid list item after all: reparse as ordinary text.
            raise statemachine.TransitionCorrection('text')
        enumlist = nodes.enumerated_list()
        (enumlist.source,
         enumlist.line) = self.state_machine.get_source_and_line()
        self.parent += enumlist
        if sequence == '#':
            # Auto-enumerated lists default to arabic numbering.
            enumlist['enumtype'] = 'arabic'
        else:
            enumlist['enumtype'] = sequence
        enumlist['prefix'] = self.enum.formatinfo[format].prefix
        enumlist['suffix'] = self.enum.formatinfo[format].suffix
        if ordinal != 1:
            enumlist['start'] = ordinal
            msg = self.reporter.info(
                'Enumerated list start value not ordinal-1: "%s" (ordinal %s)'
                % (text, ordinal), base_node=enumlist)
            self.parent += msg
        listitem, blank_finish = self.list_item(match.end())
        enumlist += listitem
        offset = self.state_machine.line_offset + 1  # next line
        newline_offset, blank_finish = self.nested_list_parse(
            self.state_machine.input_lines[offset:],
            input_offset=self.state_machine.abs_line_offset() + 1,
            node=enumlist, initial_state='EnumeratedList',
            blank_finish=blank_finish,
            extra_settings={'lastordinal': ordinal,
                            'format': format,
                            'auto': sequence == '#'})
        self.goto_line(newline_offset)
        if not blank_finish:
            self.parent += self.unindent_warning('Enumerated list')
        return [], next_state, []
1443
    def parse_enumerator(self, match, expected_sequence=None):
        """
        Analyze an enumerator and return the results.

        :Return:
            - the enumerator format ('period', 'parens', or 'rparen'),
            - the sequence used ('arabic', 'loweralpha', 'upperroman', etc.),
            - the text of the enumerator, stripped of formatting, and
            - the ordinal value of the enumerator ('a' -> 1, 'ii' -> 2, etc.;
              ``None`` is returned for invalid enumerator text).

        The enumerator format has already been determined by the regular
        expression match. If `expected_sequence` is given, that sequence is
        tried first. If not, we check for Roman numeral 1. This way,
        single-character Roman numerals (which are also alphabetical) can be
        matched. If no sequence has been matched, all sequences are checked in
        order.
        """
        groupdict = match.groupdict()
        sequence = ''
        for format in self.enum.formats:
            if groupdict[format]:  # was this the format matched?
                break  # yes; keep `format`
        else:  # shouldn't happen
            raise ParserError('enumerator format not matched')
        # Strip the format's prefix/suffix from the matched text.
        text = groupdict[format][self.enum.formatinfo[format].start  # noqa: E203,E501
                                 : self.enum.formatinfo[format].end]
        if text == '#':
            sequence = '#'
        elif expected_sequence:
            try:
                if self.enum.sequenceregexps[expected_sequence].match(text):
                    sequence = expected_sequence
            except KeyError:  # shouldn't happen
                raise ParserError('unknown enumerator sequence: %s'
                                  % sequence)
        elif text == 'i':
            sequence = 'lowerroman'
        elif text == 'I':
            sequence = 'upperroman'
        if not sequence:
            # Check all sequences in (significant) order.
            for sequence in self.enum.sequences:
                if self.enum.sequenceregexps[sequence].match(text):
                    break
            else:  # shouldn't happen
                raise ParserError('enumerator sequence not matched')
        if sequence == '#':
            ordinal = 1
        else:
            try:
                ordinal = int(self.enum.converters[sequence](text))
            except InvalidRomanNumeralError:
                ordinal = None
        return format, sequence, text, ordinal
1498
    def is_enumerated_list_item(self, ordinal, sequence, format):
        """
        Check validity based on the ordinal value and the second line.

        Return true if the ordinal is valid and the second line is blank,
        indented, or starts with the next enumerator or an auto-enumerator.
        """
        if ordinal is None:
            return None
        try:
            # Peek at the next input line, then restore the position.
            next_line = self.state_machine.next_line()
        except EOFError:  # end of input lines
            self.state_machine.previous_line()
            return 1
        else:
            self.state_machine.previous_line()
        if not next_line[:1].strip():  # blank or indented
            return 1
        result = self.make_enumerator(ordinal + 1, sequence, format)
        if result:
            next_enumerator, auto_enumerator = result
            try:
                if next_line.startswith((next_enumerator, auto_enumerator)):
                    return 1
            except TypeError:
                pass
        return None
1526
1527 def make_enumerator(self, ordinal, sequence, format):
1528 """
1529 Construct and return the next enumerated list item marker, and an
1530 auto-enumerator ("#" instead of the regular enumerator).
1531
1532 Return ``None`` for invalid (out of range) ordinals.
1533 """
1534 if sequence == '#':
1535 enumerator = '#'
1536 elif sequence == 'arabic':
1537 enumerator = str(ordinal)
1538 else:
1539 if sequence.endswith('alpha'):
1540 if ordinal > 26:
1541 return None
1542 enumerator = chr(ordinal + ord('a') - 1)
1543 elif sequence.endswith('roman'):
1544 try:
1545 enumerator = RomanNumeral(ordinal).to_uppercase()
1546 except TypeError:
1547 return None
1548 else: # shouldn't happen
1549 raise ParserError('unknown enumerator sequence: "%s"'
1550 % sequence)
1551 if sequence.startswith('lower'):
1552 enumerator = enumerator.lower()
1553 elif sequence.startswith('upper'):
1554 enumerator = enumerator.upper()
1555 else: # shouldn't happen
1556 raise ParserError('unknown enumerator sequence: "%s"'
1557 % sequence)
1558 formatinfo = self.enum.formatinfo[format]
1559 next_enumerator = (formatinfo.prefix + enumerator + formatinfo.suffix
1560 + ' ')
1561 auto_enumerator = formatinfo.prefix + '#' + formatinfo.suffix + ' '
1562 return next_enumerator, auto_enumerator
1563
1564 def field_marker(self, match, context, next_state):
1565 """Field list item."""
1566 field_list = nodes.field_list()
1567 self.parent += field_list
1568 field, blank_finish = self.field(match)
1569 field_list += field
1570 offset = self.state_machine.line_offset + 1 # next line
1571 newline_offset, blank_finish = self.nested_list_parse(
1572 self.state_machine.input_lines[offset:],
1573 input_offset=self.state_machine.abs_line_offset() + 1,
1574 node=field_list, initial_state='FieldList',
1575 blank_finish=blank_finish)
1576 self.goto_line(newline_offset)
1577 if not blank_finish:
1578 self.parent += self.unindent_warning('Field list')
1579 return [], next_state, []
1580
    def field(self, match):
        """Return a field node parsed from a field-marker match, and
        whether the field ends with a blank line."""
        name = self.parse_field_marker(match)
        src, srcline = self.state_machine.get_source_and_line()
        lineno = self.state_machine.abs_line_number()
        (indented, indent, line_offset, blank_finish
         ) = self.state_machine.get_first_known_indented(match.end())
        field_node = nodes.field()
        field_node.source = src
        field_node.line = srcline
        # Field names may contain inline markup.
        name_nodes, name_messages = self.inline_text(name, lineno)
        field_node += nodes.field_name(name, '', *name_nodes)
        field_body = nodes.field_body('\n'.join(indented), *name_messages)
        field_node += field_body
        if indented:
            self.parse_field_body(indented, line_offset, field_body)
        return field_node, blank_finish
1597
1598 def parse_field_marker(self, match):
1599 """Extract & return field name from a field marker match."""
1600 field = match.group()[1:] # strip off leading ':'
1601 field = field[:field.rfind(':')] # strip off trailing ':' etc.
1602 return field
1603
    def parse_field_body(self, indented, offset, node) -> None:
        """Parse `indented` lines as body elements into `node`."""
        self.nested_parse(indented, input_offset=offset, node=node)
1606
    def option_marker(self, match, context, next_state):
        """Option list item."""
        optionlist = nodes.option_list()
        (optionlist.source, optionlist.line
         ) = self.state_machine.get_source_and_line()
        try:
            listitem, blank_finish = self.option_list_item(match)
        except MarkupError as error:
            # This shouldn't happen; pattern won't match.
            msg = self.reporter.error('Invalid option list marker: %s'
                                      % error)
            self.parent += msg
            # Fall back: treat the indented text as a block quote.
            (indented, indent, line_offset, blank_finish
             ) = self.state_machine.get_first_known_indented(match.end())
            elements = self.block_quote(indented, line_offset)
            self.parent += elements
            if not blank_finish:
                self.parent += self.unindent_warning('Option list')
            return [], next_state, []
        self.parent += optionlist
        optionlist += listitem
        offset = self.state_machine.line_offset + 1  # next line
        newline_offset, blank_finish = self.nested_list_parse(
            self.state_machine.input_lines[offset:],
            input_offset=self.state_machine.abs_line_offset() + 1,
            node=optionlist, initial_state='OptionList',
            blank_finish=blank_finish)
        self.goto_line(newline_offset)
        if not blank_finish:
            self.parent += self.unindent_warning('Option list')
        return [], next_state, []
1638
    def option_list_item(self, match):
        """Return an option_list_item node, and whether it ends with a
        blank line."""
        offset = self.state_machine.abs_line_offset()
        options = self.parse_option_marker(match)
        (indented, indent, line_offset, blank_finish
         ) = self.state_machine.get_first_known_indented(match.end())
        if not indented:  # not an option list item
            # No description: back up and reparse as ordinary text.
            self.goto_line(offset)
            raise statemachine.TransitionCorrection('text')
        option_group = nodes.option_group('', *options)
        description = nodes.description('\n'.join(indented))
        option_list_item = nodes.option_list_item('', option_group,
                                                  description)
        if indented:
            self.nested_parse(indented, input_offset=line_offset,
                              node=description)
        return option_list_item, blank_finish
1655
    def parse_option_marker(self, match):
        """
        Return a list of `node.option` and `node.option_argument` objects,
        parsed from an option marker match.

        :Exception: `MarkupError` for invalid option markers.
        """
        optlist = []
        # split at ", ", except inside < > (complex arguments)
        optionstrings = re.split(r', (?![^<]*>)', match.group().rstrip())
        for optionstring in optionstrings:
            tokens = optionstring.split()
            delimiter = ' '
            firstopt = tokens[0].split('=', 1)
            if len(firstopt) > 1:
                # "--opt=value" form
                tokens[:1] = firstopt
                delimiter = '='
            elif (len(tokens[0]) > 2
                  and ((tokens[0].startswith('-')
                        and not tokens[0].startswith('--'))
                       or tokens[0].startswith('+'))):
                # "-ovalue" form
                tokens[:1] = [tokens[0][:2], tokens[0][2:]]
                delimiter = ''
            if len(tokens) > 1 and (tokens[1].startswith('<')
                                    and tokens[-1].endswith('>')):
                # "-o <value1 value2>" form; join all values into one token
                tokens[1:] = [' '.join(tokens[1:])]
            if 0 < len(tokens) <= 2:
                option = nodes.option(optionstring)
                option += nodes.option_string(tokens[0], tokens[0])
                if len(tokens) > 1:
                    option += nodes.option_argument(tokens[1], tokens[1],
                                                    delimiter=delimiter)
                optlist.append(option)
            else:
                raise MarkupError(
                    'wrong number of option tokens (=%s), should be 1 or 2: '
                    '"%s"' % (len(tokens), optionstring))
        return optlist
1697
    def doctest(self, match, context, next_state):
        """Parse a doctest block (text block starting with '>>>')."""
        line = self.document.current_line
        data = '\n'.join(self.state_machine.get_text_block())
        # TODO: Parse with `directives.body.CodeBlock` with
        # argument 'pycon' (Python Console) in Docutils 1.0.
        n = nodes.doctest_block(data, data)
        n.line = line
        self.parent += n
        return [], next_state, []
1707
    def line_block(self, match, context, next_state):
        """First line of a line block."""
        block = nodes.line_block()
        self.parent += block
        lineno = self.state_machine.abs_line_number()
        (block.source,
         block.line) = self.state_machine.get_source_and_line(lineno)
        line, messages, blank_finish = self.line_block_line(match, lineno)
        block += line
        self.parent += messages
        if not blank_finish:
            # Collect the remaining lines of the block.
            offset = self.state_machine.line_offset + 1  # next line
            new_line_offset, blank_finish = self.nested_list_parse(
                self.state_machine.input_lines[offset:],
                input_offset=self.state_machine.abs_line_offset() + 1,
                node=block, initial_state='LineBlock',
                blank_finish=False)
            self.goto_line(new_line_offset)
        if not blank_finish:
            self.parent += self.reporter.warning(
                'Line block ends without a blank line.',
                line=lineno+1)
        if len(block):
            if block[0].indent is None:
                # First line's indent defaults to 0.
                block[0].indent = 0
            self.nest_line_block_lines(block)
        return [], next_state, []
1735
    def line_block_line(self, match, lineno):
        """Return one line element of a line_block."""
        (indented, indent, line_offset, blank_finish
         ) = self.state_machine.get_first_known_indented(match.end(),
                                                         until_blank=True)
        text = '\n'.join(indented)
        text_nodes, messages = self.inline_text(text, lineno)
        line = nodes.line(text, '', *text_nodes)
        (line.source,
         line.line) = self.state_machine.get_source_and_line(lineno)
        if match.string.rstrip() != '|':  # not empty
            # Indent is measured relative to the '|' marker.
            line.indent = len(match.group(1)) - 1
        return line, messages, blank_finish
1749
1750 def nest_line_block_lines(self, block) -> None:
1751 for index in range(1, len(block)):
1752 if block[index].indent is None:
1753 block[index].indent = block[index - 1].indent
1754 self.nest_line_block_segment(block)
1755
1756 def nest_line_block_segment(self, block) -> None:
1757 indents = [item.indent for item in block]
1758 least = min(indents)
1759 new_items = []
1760 new_block = nodes.line_block()
1761 for item in block:
1762 if item.indent > least:
1763 new_block.append(item)
1764 else:
1765 if len(new_block):
1766 self.nest_line_block_segment(new_block)
1767 new_items.append(new_block)
1768 new_block = nodes.line_block()
1769 new_items.append(item)
1770 if len(new_block):
1771 self.nest_line_block_segment(new_block)
1772 new_items.append(new_block)
1773 block[:] = new_items
1774
    def grid_table_top(self, match, context, next_state):
        """Top border of a full table."""
        # Delegate to the generic handler with grid-table specifics.
        return self.table_top(match, context, next_state,
                              self.isolate_grid_table,
                              tableparser.GridTableParser)
1780
    def simple_table_top(self, match, context, next_state):
        """Top border of a simple table."""
        # Delegate to the generic handler with simple-table specifics.
        return self.table_top(match, context, next_state,
                              self.isolate_simple_table,
                              tableparser.SimpleTableParser)
1786
1787 def table_top(self, match, context, next_state,
1788 isolate_function, parser_class):
1789 """Top border of a generic table."""
1790 nodelist, blank_finish = self.table(isolate_function, parser_class)
1791 self.parent += nodelist
1792 if not blank_finish:
1793 msg = self.reporter.warning(
1794 'Blank line required after table.',
1795 line=self.state_machine.abs_line_number()+1)
1796 self.parent += msg
1797 return [], next_state, []
1798
1799 def table(self, isolate_function, parser_class):
1800 """Parse a table."""
1801 block, messages, blank_finish = isolate_function()
1802 if block:
1803 try:
1804 parser = parser_class()
1805 tabledata = parser.parse(block)
1806 tableline = (self.state_machine.abs_line_number() - len(block)
1807 + 1)
1808 table = self.build_table(tabledata, tableline)
1809 nodelist = [table] + messages
1810 except tableparser.TableMarkupError as err:
1811 nodelist = self.malformed_table(block, ' '.join(err.args),
1812 offset=err.offset) + messages
1813 else:
1814 nodelist = messages
1815 return nodelist, blank_finish
1816
    def isolate_grid_table(self):
        """Extract and sanity-check the text block of a grid table.

        Returns (block, messages, blank_finish); block is empty on failure.
        """
        messages = []
        blank_finish = True
        try:
            block = self.state_machine.get_text_block(flush_left=True)
        except statemachine.UnexpectedIndentationError as err:
            block, src, srcline = err.args
            messages.append(self.reporter.error('Unexpected indentation.',
                                                source=src, line=srcline))
            blank_finish = False
        block.disconnect()
        # for East Asian chars:
        block.pad_double_width(self.double_width_pad_char)
        width = len(block[0].strip())
        for i in range(len(block)):
            block[i] = block[i].strip()
            if block[i][0] not in '+|':  # check left edge
                # Table ends where the left edge stops; back up the SM.
                blank_finish = False
                self.state_machine.previous_line(len(block) - i)
                del block[i:]
                break
        if not self.grid_table_top_pat.match(block[-1]):  # find bottom
            # from second-last to third line of table:
            for i in range(len(block) - 2, 1, -1):
                if self.grid_table_top_pat.match(block[i]):
                    self.state_machine.previous_line(len(block) - i + 1)
                    del block[i+1:]
                    blank_finish = False
                    break
            else:
                # NOTE(review): `i` here is the last value from the exhausted
                # loop (2), used as the message offset — confirm intended.
                detail = 'Bottom border missing or corrupt.'
                messages.extend(self.malformed_table(block, detail, i))
                return [], messages, blank_finish
        for i in range(len(block)):  # check right edge
            # Combining characters don't add to the visible width.
            if len(strip_combining_chars(block[i])
                   ) != width or block[i][-1] not in '+|':
                detail = 'Right border not aligned or missing.'
                messages.extend(self.malformed_table(block, detail, i))
                return [], messages, blank_finish
        return block, messages, blank_finish
1857
    def isolate_simple_table(self):
        """Extract the text block of a simple table (top border to bottom).

        Returns (block, messages, blank_finish); block is empty on failure.
        """
        start = self.state_machine.line_offset
        lines = self.state_machine.input_lines
        limit = len(lines) - 1
        toplen = len(lines[start].strip())
        pattern_match = self.simple_table_border_pat.match
        found = 0          # number of border lines seen after the top
        found_at = None    # offset of the last border line seen
        i = start + 1
        while i <= limit:
            line = lines[i]
            match = pattern_match(line)
            if match:
                if len(line.strip()) != toplen:
                    # Border lengths must all match the top border.
                    self.state_machine.next_line(i - start)
                    messages = self.malformed_table(
                        lines[start:i+1], 'Bottom border or header rule does '
                        'not match top border.', i-start)
                    return [], messages, i == limit or not lines[i+1].strip()
                found += 1
                found_at = i
                # Second border, end of input, or a following blank line
                # terminates the table.
                if found == 2 or i == limit or not lines[i+1].strip():
                    end = i
                    break
            i += 1
        else:  # reached end of input_lines
            details = 'No bottom table border found'
            if found:
                details += ' or no blank line after table bottom'
                self.state_machine.next_line(found_at - start)
                block = lines[start:found_at+1]
            else:
                self.state_machine.next_line(i - start - 1)
                block = lines[start:]
            messages = self.malformed_table(block, details + '.')
            return [], messages, not found
        self.state_machine.next_line(end - start)
        block = lines[start:end+1]
        # for East Asian chars:
        block.pad_double_width(self.double_width_pad_char)
        return block, [], end == limit or not lines[end+1].strip()
1899
1900 def malformed_table(self, block, detail='', offset=0):
1901 block.replace(self.double_width_pad_char, '')
1902 data = '\n'.join(block)
1903 message = 'Malformed table.'
1904 startline = self.state_machine.abs_line_number() - len(block) + 1
1905 if detail:
1906 message += '\n' + detail
1907 error = self.reporter.error(message, nodes.literal_block(data, data),
1908 line=startline+offset)
1909 return [error]
1910
1911 def build_table(self, tabledata, tableline, stub_columns=0, widths=None):
1912 colwidths, headrows, bodyrows = tabledata
1913 table = nodes.table()
1914 if widths == 'auto':
1915 table['classes'] += ['colwidths-auto']
1916 elif widths: # "grid" or list of integers
1917 table['classes'] += ['colwidths-given']
1918 tgroup = nodes.tgroup(cols=len(colwidths))
1919 table += tgroup
1920 for colwidth in colwidths:
1921 colspec = nodes.colspec(colwidth=colwidth)
1922 if stub_columns:
1923 colspec.attributes['stub'] = True
1924 stub_columns -= 1
1925 tgroup += colspec
1926 if headrows:
1927 thead = nodes.thead()
1928 tgroup += thead
1929 for row in headrows:
1930 thead += self.build_table_row(row, tableline)
1931 tbody = nodes.tbody()
1932 tgroup += tbody
1933 for row in bodyrows:
1934 tbody += self.build_table_row(row, tableline)
1935 return table
1936
1937 def build_table_row(self, rowdata, tableline):
1938 row = nodes.row()
1939 for cell in rowdata:
1940 if cell is None:
1941 continue
1942 morerows, morecols, offset, cellblock = cell
1943 attributes = {}
1944 if morerows:
1945 attributes['morerows'] = morerows
1946 if morecols:
1947 attributes['morecols'] = morecols
1948 entry = nodes.entry(**attributes)
1949 row += entry
1950 if ''.join(cellblock):
1951 self.nested_parse(cellblock, input_offset=tableline+offset,
1952 node=entry)
1953 return row
1954
    explicit = Struct()
    """Patterns and constants used for explicit markup recognition."""

    # Compiled regexes for the bodies of explicit-markup constructs.
    # Interpolated fragments (e.g. non_whitespace_escape_before) come from
    # `Inliner` class attributes via ``vars(Inliner)``.
    explicit.patterns = Struct(
          target=re.compile(r"""
                            (
                              _               # anonymous target
                            | # *OR*
                              (?!_)           # no underscore at the beginning
                              (?P<quote>`?)   # optional open quote
                              (?![ `])        # first char. not space or
                                              # backquote
                              (?P<name>       # reference name
                                .+?
                              )
                              %(non_whitespace_escape_before)s
                              (?P=quote)      # close quote if open quote used
                            )
                            (?<!(?<!\x00):)   # no unescaped colon at end
                            %(non_whitespace_escape_before)s
                            [ ]?              # optional space
                            :                 # end of reference name
                            ([ ]+|$)          # followed by whitespace
                            """ % vars(Inliner), re.VERBOSE),
          reference=re.compile(r"""
                               (
                                 (?P<simple>%(simplename)s)_
                               | # *OR*
                                 `                  # open backquote
                                 (?![ ])            # not space
                                 (?P<phrase>.+?)    # hyperlink phrase
                                 %(non_whitespace_escape_before)s
                                 `_                 # close backquote,
                                                    # reference mark
                               )
                               $                    # end of string
                               """ % vars(Inliner), re.VERBOSE),
          substitution=re.compile(r"""
                                  (
                                    (?![ ])          # first char. not space
                                    (?P<name>.+?)    # substitution text
                                    %(non_whitespace_escape_before)s
                                    \|               # close delimiter
                                  )
                                  ([ ]+|$)           # followed by whitespace
                                  """ % vars(Inliner),
                                  re.VERBOSE),)
2002
    def footnote(self, match):
        """Parse a footnote (".. [label] content") and register it.

        Handles manually numbered, auto-numbered ("#", "#label"), and
        auto-symbol ("*") footnotes; returns ([footnote], blank_finish).
        """
        src, srcline = self.state_machine.get_source_and_line()
        (indented, indent, offset, blank_finish
         ) = self.state_machine.get_first_known_indented(match.end())
        label = match.group(1)
        name = normalize_name(label)
        footnote = nodes.footnote('\n'.join(indented))
        footnote.source = src
        footnote.line = srcline
        if name[0] == '#':  # auto-numbered
            name = name[1:]  # autonumber label
            footnote['auto'] = 1
            if name:
                footnote['names'].append(name)
            self.document.note_autofootnote(footnote)
        elif name == '*':  # auto-symbol
            name = ''
            footnote['auto'] = '*'
            self.document.note_symbol_footnote(footnote)
        else:  # manually numbered
            footnote += nodes.label('', label)
            footnote['names'].append(name)
            self.document.note_footnote(footnote)
        if name:
            self.document.note_explicit_target(footnote, footnote)
        else:
            # No name: just assign an ID; not a link target by name.
            self.document.set_id(footnote, footnote)
        if indented:
            self.nested_parse(indented, input_offset=offset, node=footnote)
        else:
            footnote += self.reporter.warning('Footnote content expected.')
        return [footnote], blank_finish
2035
2036 def citation(self, match):
2037 src, srcline = self.state_machine.get_source_and_line()
2038 (indented, indent, offset, blank_finish
2039 ) = self.state_machine.get_first_known_indented(match.end())
2040 label = match.group(1)
2041 name = normalize_name(label)
2042 citation = nodes.citation('\n'.join(indented))
2043 citation.source = src
2044 citation.line = srcline
2045 citation += nodes.label('', label)
2046 citation['names'].append(name)
2047 self.document.note_citation(citation)
2048 self.document.note_explicit_target(citation, citation)
2049 if indented:
2050 self.nested_parse(indented, input_offset=offset, node=citation)
2051 else:
2052 citation += self.reporter.warning('Citation content expected.')
2053 return [citation], blank_finish
2054
    def hyperlink_target(self, match):
        """Parse an explicit hyperlink target (".. _name: URI").

        Raises `MarkupError` (via the loop below) if no target pattern can
        be matched; returns ([target], blank_finish).
        """
        pattern = self.explicit.patterns.target
        lineno = self.state_machine.abs_line_number()
        (block, indent, offset, blank_finish
         ) = self.state_machine.get_first_known_indented(
             match.end(), until_blank=True, strip_indent=False)
        blocktext = match.string[:match.end()] + '\n'.join(block)
        block = [escape2null(line) for line in block]
        escaped = block[0]
        blockindex = 0
        # Accumulate lines until the target pattern matches the escaped text.
        while True:
            targetmatch = pattern.match(escaped)
            if targetmatch:
                break
            blockindex += 1
            try:
                escaped += block[blockindex]
            except IndexError:
                raise MarkupError('malformed hyperlink target.')
        del block[:blockindex]
        # Strip the matched target prefix from the first remaining line.
        block[0] = (block[0] + ' ')[targetmatch.end()-len(escaped)-1:].strip()
        target = self.make_target(block, blocktext, lineno,
                                  targetmatch.group('name'))
        return [target], blank_finish
2079
2080 def make_target(self, block, block_text, lineno, target_name):
2081 target_type, data = self.parse_target(block, block_text, lineno)
2082 if target_type == 'refname':
2083 target = nodes.target(block_text, '', refname=normalize_name(data))
2084 target.indirect_reference_name = data
2085 self.add_target(target_name, '', target, lineno)
2086 self.document.note_indirect_target(target)
2087 return target
2088 elif target_type == 'refuri':
2089 target = nodes.target(block_text, '')
2090 self.add_target(target_name, data, target, lineno)
2091 return target
2092 else:
2093 return data
2094
2095 def parse_target(self, block, block_text, lineno):
2096 """
2097 Determine the type of reference of a target.
2098
2099 :Return: A 2-tuple, one of:
2100
2101 - 'refname' and the indirect reference name
2102 - 'refuri' and the URI
2103 - 'malformed' and a system_message node
2104 """
2105 if block and block[-1].strip()[-1:] == '_': # possible indirect target
2106 reference = ' '.join(line.strip() for line in block)
2107 refname = self.is_reference(reference)
2108 if refname:
2109 return 'refname', refname
2110 ref_parts = split_escaped_whitespace(' '.join(block))
2111 reference = ' '.join(''.join(unescape(part).split())
2112 for part in ref_parts)
2113 return 'refuri', reference
2114
2115 def is_reference(self, reference):
2116 match = self.explicit.patterns.reference.match(
2117 whitespace_normalize_name(reference))
2118 if not match:
2119 return None
2120 return unescape(match.group('simple') or match.group('phrase'))
2121
2122 def add_target(self, targetname, refuri, target, lineno):
2123 target.line = lineno
2124 if targetname:
2125 name = normalize_name(unescape(targetname))
2126 target['names'].append(name)
2127 if refuri:
2128 uri = self.inliner.adjust_uri(refuri)
2129 if uri:
2130 target['refuri'] = uri
2131 else:
2132 raise ApplicationError('problem with URI: %r' % refuri)
2133 self.document.note_explicit_target(target, self.parent)
2134 else: # anonymous target
2135 if refuri:
2136 target['refuri'] = refuri
2137 target['anonymous'] = True
2138 self.document.note_anonymous_target(target)
2139
    def substitution_def(self, match):
        """Parse a substitution definition (".. |name| directive:: ...").

        Returns ([substitution_definition], blank_finish), or an error/
        warning message list if the definition is missing, empty, or
        contains disallowed content.
        """
        pattern = self.explicit.patterns.substitution
        src, srcline = self.state_machine.get_source_and_line()
        (block, indent, offset, blank_finish
         ) = self.state_machine.get_first_known_indented(match.end(),
                                                         strip_indent=False)
        blocktext = (match.string[:match.end()] + '\n'.join(block))
        block.disconnect()
        escaped = escape2null(block[0].rstrip())
        blockindex = 0
        # Accumulate lines until the "|name|" pattern matches.
        while True:
            subdefmatch = pattern.match(escaped)
            if subdefmatch:
                break
            blockindex += 1
            try:
                escaped = escaped + ' ' + escape2null(
                    block[blockindex].strip())
            except IndexError:
                raise MarkupError('malformed substitution definition.')
        del block[:blockindex]  # strip out the substitution marker
        # Remove the matched marker prefix from the first remaining line.
        start = subdefmatch.end()-len(escaped)-1
        block[0] = (block[0].strip() + ' ')[start:-1]
        if not block[0]:
            del block[0]
            offset += 1
        while block and not block[-1].strip():
            block.pop()
        subname = subdefmatch.group('name')
        substitution_node = nodes.substitution_definition(blocktext)
        substitution_node.source = src
        substitution_node.line = srcline
        if not block:
            msg = self.reporter.warning(
                'Substitution definition "%s" missing contents.' % subname,
                nodes.literal_block(blocktext, blocktext),
                source=src, line=srcline)
            return [msg], blank_finish
        block[0] = block[0].strip()
        substitution_node['names'].append(
            nodes.whitespace_normalize_name(subname))
        # Parse the embedded directive with the SubstitutionDef state.
        new_abs_offset, blank_finish = self.nested_list_parse(
            block, input_offset=offset, node=substitution_node,
            initial_state='SubstitutionDef', blank_finish=blank_finish)
        i = 0
        # Move any non-inline children (e.g. system messages) to the parent.
        for node in substitution_node[:]:
            if not (isinstance(node, nodes.Inline)
                    or isinstance(node, nodes.Text)):
                self.parent += substitution_node[i]
                del substitution_node[i]
            else:
                i += 1
        # Reject content that cannot live inside a substitution definition.
        for node in substitution_node.findall(nodes.Element,
                                              include_self=False):
            if isinstance(node, nodes.problematic):
                msg = self.reporter.error(
                    'Problematic content in substitution definition',
                    nodes.literal_block('', blocktext),
                    source=src, line=srcline)
                msg.append(nodes.block_quote(
                    '', nodes.paragraph('', '', *substitution_node.children)))
                return [msg], blank_finish
            illegal = self.disallowed_inside_substitution_definitions(node)
            if illegal:
                msg = self.reporter.error(f'{illegal} are not supported in '
                                          'a substitution definition.',
                                          nodes.literal_block('', blocktext),
                                          source=src, line=srcline)
                return [msg], blank_finish
        if len(substitution_node) == 0:
            msg = self.reporter.warning(
                'Substitution definition "%s" empty or invalid.' % subname,
                nodes.literal_block(blocktext, blocktext),
                source=src, line=srcline)
            return [msg], blank_finish
        self.document.note_substitution_def(
            substitution_node, subname, self.parent)
        return [substitution_node], blank_finish
2218
2219 def disallowed_inside_substitution_definitions(self, node) -> str:
2220 if isinstance(node, nodes.reference) and node.get('anonymous'):
2221 return 'Anonymous references'
2222 if isinstance(node, nodes.footnote_reference) and node.get('auto'):
2223 return 'References to auto-numbered and auto-symbol footnotes'
2224 if node['names'] or node['ids']:
2225 return 'Targets (names and identifiers)'
2226 else:
2227 return ''
2228
2229 def directive(self, match, **option_presets):
2230 """Returns a 2-tuple: list of nodes, and a "blank finish" boolean."""
2231 type_name = match.group(1)
2232 directive_class, messages = directives.directive(
2233 type_name, self.memo.language, self.document)
2234 self.parent += messages
2235 if directive_class:
2236 return self.run_directive(
2237 directive_class, match, type_name, option_presets)
2238 else:
2239 return self.unknown_directive(type_name)
2240
    def run_directive(self, directive, match, type_name, option_presets):
        """
        Parse a directive then run its directive function.

        Parameters:

        - `directive`: The class implementing the directive.  Must be
          a subclass of `rst.Directive`.

        - `match`: A regular expression match object which matched the first
          line of the directive.

        - `type_name`: The directive name, as used in the source text.

        - `option_presets`: A dictionary of preset options, defaults for the
          directive options.  Currently, only an "alt" option is passed by
          substitution definitions (value: the substitution name), which may
          be used by an embedded image directive.

        Returns a 2-tuple: list of nodes, and a "blank finish" boolean.
        """
        # Legacy support: plain functions are wrapped into Directive classes.
        if isinstance(directive, (FunctionType, MethodType)):
            from docutils.parsers.rst import convert_directive_function
            directive = convert_directive_function(directive)
        lineno = self.state_machine.abs_line_number()
        initial_line_offset = self.state_machine.line_offset
        (indented, indent, line_offset, blank_finish
         ) = self.state_machine.get_first_known_indented(match.end(),
                                                         strip_top=0)
        # Full directive text, for error messages and the Directive API.
        block_text = '\n'.join(self.state_machine.input_lines[
            initial_line_offset : self.state_machine.line_offset + 1])  # noqa: E203,E501
        try:
            arguments, options, content, content_offset = (
                self.parse_directive_block(indented, line_offset,
                                           directive, option_presets))
        except MarkupError as detail:
            error = self.reporter.error(
                'Error in "%s" directive:\n%s.' % (type_name,
                                                   ' '.join(detail.args)),
                nodes.literal_block(block_text, block_text), line=lineno)
            return [error], blank_finish
        directive_instance = directive(
            type_name, arguments, options, content, lineno,
            content_offset, block_text, self, self.state_machine)
        try:
            result = directive_instance.run()
        except docutils.parsers.rst.DirectiveError as error:
            # DirectiveError carries its own severity level.
            msg_node = self.reporter.system_message(error.level, error.msg,
                                                    line=lineno)
            msg_node += nodes.literal_block(block_text, block_text)
            result = [msg_node]
        assert isinstance(result, list), \
            'Directive "%s" must return a list of nodes.' % type_name
        for i in range(len(result)):
            assert isinstance(result[i], nodes.Node), \
                ('Directive "%s" returned non-Node object (index %s): %r'
                 % (type_name, i, result[i]))
        return (result,
                blank_finish or self.state_machine.is_next_line_blank())
2300
    def parse_directive_block(self, indented, line_offset, directive,
                              option_presets):
        """Split a directive's indented block into arguments/options/content.

        Returns (arguments, options, content, content_offset); raises
        `MarkupError` on bad options or unexpected content.
        """
        option_spec = directive.option_spec
        has_content = directive.has_content
        if indented and not indented[0].strip():
            indented.trim_start()
            line_offset += 1
        while indented and not indented[-1].strip():
            indented.trim_end()
        if indented and (directive.required_arguments
                         or directive.optional_arguments
                         or option_spec):
            # The first blank line separates arguments+options from content.
            for i, line in enumerate(indented):
                if not line.strip():
                    break
            else:
                # No blank line: everything is arguments/options.
                i += 1
            arg_block = indented[:i]
            content = indented[i+1:]
            content_offset = line_offset + i + 1
        else:
            content = indented
            content_offset = line_offset
            arg_block = []
        if option_spec:
            options, arg_block = self.parse_directive_options(
                option_presets, option_spec, arg_block)
        else:
            options = {}
        if arg_block and not (directive.required_arguments
                              or directive.optional_arguments):
            # Directive takes no arguments: the "arguments" are content.
            content = arg_block + indented[i:]
            content_offset = line_offset
            arg_block = []
        while content and not content[0].strip():
            content.trim_start()
            content_offset += 1
        if directive.required_arguments or directive.optional_arguments:
            arguments = self.parse_directive_arguments(
                directive, arg_block)
        else:
            arguments = []
        if content and not has_content:
            raise MarkupError('no content permitted')
        return arguments, options, content, content_offset
2346
2347 def parse_directive_options(self, option_presets, option_spec, arg_block):
2348 options = option_presets.copy()
2349 for i, line in enumerate(arg_block):
2350 if re.match(Body.patterns['field_marker'], line):
2351 opt_block = arg_block[i:]
2352 arg_block = arg_block[:i]
2353 break
2354 else:
2355 opt_block = []
2356 if opt_block:
2357 success, data = self.parse_extension_options(option_spec,
2358 opt_block)
2359 if success: # data is a dict of options
2360 options.update(data)
2361 else: # data is an error string
2362 raise MarkupError(data)
2363 return options, arg_block
2364
2365 def parse_directive_arguments(self, directive, arg_block):
2366 required = directive.required_arguments
2367 optional = directive.optional_arguments
2368 arg_text = '\n'.join(arg_block)
2369 arguments = arg_text.split()
2370 if len(arguments) < required:
2371 raise MarkupError('%s argument(s) required, %s supplied'
2372 % (required, len(arguments)))
2373 elif len(arguments) > required + optional:
2374 if directive.final_argument_whitespace:
2375 arguments = arg_text.split(None, required + optional - 1)
2376 else:
2377 raise MarkupError(
2378 'maximum %s argument(s) allowed, %s supplied'
2379 % (required + optional, len(arguments)))
2380 return arguments
2381
2382 def parse_extension_options(self, option_spec, datalines):
2383 """
2384 Parse `datalines` for a field list containing extension options
2385 matching `option_spec`.
2386
2387 :Parameters:
2388 - `option_spec`: a mapping of option name to conversion
2389 function, which should raise an exception on bad input.
2390 - `datalines`: a list of input strings.
2391
2392 :Return:
2393 - Success value, 1 or 0.
2394 - An option dictionary on success, an error string on failure.
2395 """
2396 node = nodes.field_list()
2397 newline_offset, blank_finish = self.nested_list_parse(
2398 datalines, 0, node, initial_state='ExtensionOptions',
2399 blank_finish=True)
2400 if newline_offset != len(datalines): # incomplete parse of block
2401 return 0, 'invalid option block'
2402 try:
2403 options = utils.extract_extension_options(node, option_spec)
2404 except KeyError as detail:
2405 return 0, 'unknown option: "%s"' % detail.args[0]
2406 except (ValueError, TypeError) as detail:
2407 return 0, 'invalid option value: %s' % ' '.join(detail.args)
2408 except utils.ExtensionOptionError as detail:
2409 return 0, 'invalid option data: %s' % ' '.join(detail.args)
2410 if blank_finish:
2411 return 1, options
2412 else:
2413 return 0, 'option data incompletely parsed'
2414
2415 def unknown_directive(self, type_name):
2416 lineno = self.state_machine.abs_line_number()
2417 (indented, indent, offset, blank_finish
2418 ) = self.state_machine.get_first_known_indented(0, strip_indent=False)
2419 text = '\n'.join(indented)
2420 error = self.reporter.error('Unknown directive type "%s".' % type_name,
2421 nodes.literal_block(text, text),
2422 line=lineno)
2423 return [error], blank_finish
2424
2425 def comment(self, match):
2426 if self.state_machine.is_next_line_blank():
2427 first_comment_line = match.string[match.end():]
2428 if not first_comment_line.strip(): # empty comment
2429 return [nodes.comment()], True # "A tiny but practical wart."
2430 if first_comment_line.startswith('end of inclusion from "'):
2431 # cf. parsers.rst.directives.misc.Include
2432 self.document.include_log.pop()
2433 return [], True
2434 (indented, indent, offset, blank_finish
2435 ) = self.state_machine.get_first_known_indented(match.end())
2436 while indented and not indented[-1].strip():
2437 indented.trim_end()
2438 text = '\n'.join(indented)
2439 return [nodes.comment(text, text)], blank_finish
2440
    # Dispatch table for explicit markup: (parse method, recognizer regex)
    # pairs, tried in order by `explicit_construct`.
    explicit.constructs = [
          (footnote,
           re.compile(r"""
                      \.\.[ ]+          # explicit markup start
                      \[
                      (                 # footnote label:
                          [0-9]+          # manually numbered footnote
                        |               # *OR*
                          \#              # anonymous auto-numbered footnote
                        |               # *OR*
                          \#%s            # auto-number ed?) footnote label
                        |               # *OR*
                          \*              # auto-symbol footnote
                      )
                      \]
                      ([ ]+|$)          # whitespace or end of line
                      """ % Inliner.simplename, re.VERBOSE)),
          (citation,
           re.compile(r"""
                      \.\.[ ]+          # explicit markup start
                      \[(%s)\]          # citation label
                      ([ ]+|$)          # whitespace or end of line
                      """ % Inliner.simplename, re.VERBOSE)),
          (hyperlink_target,
           re.compile(r"""
                      \.\.[ ]+          # explicit markup start
                      _                 # target indicator
                      (?![ ]|$)         # first char. not space or EOL
                      """, re.VERBOSE)),
          (substitution_def,
           re.compile(r"""
                      \.\.[ ]+          # explicit markup start
                      \|                # substitution indicator
                      (?![ ]|$)         # first char. not space or EOL
                      """, re.VERBOSE)),
          (directive,
           re.compile(r"""
                      \.\.[ ]+          # explicit markup start
                      (%s)              # directive name
                      [ ]?              # optional space
                      ::                # directive delimiter
                      ([ ]+|$)          # whitespace or end of line
                      """ % Inliner.simplename, re.VERBOSE))]
2484
2485 def explicit_markup(self, match, context, next_state):
2486 """Footnotes, hyperlink targets, directives, comments."""
2487 nodelist, blank_finish = self.explicit_construct(match)
2488 self.parent += nodelist
2489 self.explicit_list(blank_finish)
2490 return [], next_state, []
2491
2492 def explicit_construct(self, match):
2493 """Determine which explicit construct this is, parse & return it."""
2494 errors = []
2495 for method, pattern in self.explicit.constructs:
2496 expmatch = pattern.match(match.string)
2497 if expmatch:
2498 try:
2499 return method(self, expmatch)
2500 except MarkupError as error:
2501 lineno = self.state_machine.abs_line_number()
2502 message = ' '.join(error.args)
2503 errors.append(self.reporter.warning(message, line=lineno))
2504 break
2505 nodelist, blank_finish = self.comment(match)
2506 return nodelist + errors, blank_finish
2507
2508 def explicit_list(self, blank_finish) -> None:
2509 """
2510 Create a nested state machine for a series of explicit markup
2511 constructs (including anonymous hyperlink targets).
2512 """
2513 offset = self.state_machine.line_offset + 1 # next line
2514 newline_offset, blank_finish = self.nested_list_parse(
2515 self.state_machine.input_lines[offset:],
2516 input_offset=self.state_machine.abs_line_offset() + 1,
2517 node=self.parent, initial_state='Explicit',
2518 blank_finish=blank_finish)
2519 self.goto_line(newline_offset)
2520 if not blank_finish:
2521 self.parent += self.unindent_warning('Explicit markup')
2522
2523 def anonymous(self, match, context, next_state):
2524 """Anonymous hyperlink targets."""
2525 nodelist, blank_finish = self.anonymous_target(match)
2526 self.parent += nodelist
2527 self.explicit_list(blank_finish)
2528 return [], next_state, []
2529
2530 def anonymous_target(self, match):
2531 lineno = self.state_machine.abs_line_number()
2532 (block, indent, offset, blank_finish
2533 ) = self.state_machine.get_first_known_indented(match.end(),
2534 until_blank=True)
2535 blocktext = match.string[:match.end()] + '\n'.join(block)
2536 block = [escape2null(line) for line in block]
2537 target = self.make_target(block, blocktext, lineno, '')
2538 return [target], blank_finish
2539
2540 def line(self, match, context, next_state):
2541 """Section title overline or transition marker."""
2542 if self.state_machine.match_titles:
2543 return [match.string], 'Line', []
2544 elif match.string.strip() == '::':
2545 raise statemachine.TransitionCorrection('text')
2546 elif len(match.string.strip()) < 4:
2547 msg = self.reporter.info(
2548 'Unexpected possible title overline or transition.\n'
2549 "Treating it as ordinary text because it's so short.",
2550 line=self.state_machine.abs_line_number())
2551 self.parent += msg
2552 raise statemachine.TransitionCorrection('text')
2553 else:
2554 blocktext = self.state_machine.line
2555 msg = self.reporter.error(
2556 'Unexpected section title or transition.',
2557 nodes.literal_block(blocktext, blocktext),
2558 line=self.state_machine.abs_line_number())
2559 self.parent += msg
2560 return [], next_state, []
2561
2562 def text(self, match, context, next_state):
2563 """Titles, definition lists, paragraphs."""
2564 return [match.string], 'Text', []
2565
2566
class RFC2822Body(Body):

    """
    RFC2822 headers are only valid as the first constructs in documents. As
    soon as anything else appears, the `Body` state should take over.
    """

    patterns = Body.patterns.copy()  # can't modify the original
    patterns['rfc2822'] = r'[!-9;-~]+:( +|$)'
    initial_transitions = [(name, 'Body')
                           for name in Body.initial_transitions]
    initial_transitions.insert(-1, ('rfc2822', 'Body'))  # just before 'text'

    def rfc2822(self, match, context, next_state):
        """RFC2822-style field list item."""
        fieldlist = nodes.field_list(classes=['rfc2822'])
        self.parent += fieldlist
        field, blank_finish = self.rfc2822_field(match)
        fieldlist += field
        # Parse any further fields with a nested RFC2822List state machine.
        next_line = self.state_machine.line_offset + 1
        new_offset, blank_finish = self.nested_list_parse(
            self.state_machine.input_lines[next_line:],
            input_offset=self.state_machine.abs_line_offset() + 1,
            node=fieldlist, initial_state='RFC2822List',
            blank_finish=blank_finish)
        self.goto_line(new_offset)
        if not blank_finish:
            self.parent += self.unindent_warning(
                'RFC2822-style field list')
        return [], next_state, []

    def rfc2822_field(self, match):
        """Parse a single "Name: value" header into a field node."""
        name = match.string[:match.string.find(':')]
        (indented, indent, line_offset, blank_finish
         ) = self.state_machine.get_first_known_indented(match.end(),
                                                         until_blank=True)
        field = nodes.field()
        field += nodes.field_name(name, name)
        body = nodes.field_body('\n'.join(indented))
        field += body
        if indented:
            self.nested_parse(indented, input_offset=line_offset, node=body)
        return field, blank_finish
2611
2612
class SpecializedBody(Body):

    """
    Superclass for second and subsequent compound element members.  Compound
    elements are lists and list-like constructs.

    All transition methods are disabled (redefined as `invalid_input`).
    Override individual methods in subclasses to re-enable.

    For example, once an initial bullet list item, say, is recognized, the
    `BulletList` subclass takes over, with a "bullet_list" node as its
    container.  Upon encountering the initial bullet list item, `Body.bullet`
    calls its ``self.nested_list_parse`` (`RSTState.nested_list_parse`), which
    starts up a nested parsing session with `BulletList` as the initial state.
    Only the ``bullet`` transition method is enabled in `BulletList`; as long
    as only bullet list items are encountered, they are parsed and inserted
    into the container.  The first construct which is *not* a bullet list item
    triggers the `invalid_input` method, which ends the nested parse and
    closes the container.  `BulletList` needs to recognize input that is
    invalid in the context of a bullet list, which means everything *other
    than* bullet list items, so it inherits the transition list created in
    `Body`.
    """

    def invalid_input(self, match=None, context=None, next_state=None):
        """Not a compound element member. Abort this state machine."""
        self.state_machine.previous_line()  # back up so parent SM can reassess
        raise EOFError

    # Disable every Body transition; subclasses re-enable the one they need.
    indent = invalid_input
    bullet = invalid_input
    enumerator = invalid_input
    field_marker = invalid_input
    option_marker = invalid_input
    doctest = invalid_input
    line_block = invalid_input
    grid_table_top = invalid_input
    simple_table_top = invalid_input
    explicit_markup = invalid_input
    anonymous = invalid_input
    line = invalid_input
    text = invalid_input
2655
2656
class BulletList(SpecializedBody):

    """Second and subsequent bullet_list list_items."""

    def bullet(self, match, context, next_state):
        """Another bullet list item, if its bullet character matches."""
        if match.string[0] != self.parent['bullet']:
            # A different bullet character starts a new list.
            self.invalid_input()
        item, blank_finish = self.list_item(match.end())
        self.parent += item
        self.blank_finish = blank_finish
        return [], next_state, []
2670
2671
class DefinitionList(SpecializedBody):

    """Second and subsequent definition_list_items."""

    def text(self, match, context, next_state):
        """A text line: potentially the term of another definition."""
        # Let the `Definition` state decide based on the following line.
        return [match.string], 'Definition', []
2679
2680
class EnumeratedList(SpecializedBody):

    """Second and subsequent enumerated_list list_items."""

    def enumerator(self, match, context, next_state):
        """Another enumerated list item, if it continues the sequence."""
        format, sequence, text, ordinal = self.parse_enumerator(
            match, self.parent['enumtype'])
        format_matches = format == self.format
        # An auto-enumerator ('#') always continues the list; otherwise
        # the sequence type must match and the ordinal must be the
        # immediate successor (unless the list is already auto-enumerated).
        sequence_matches = (
            sequence == '#'
            or (sequence == self.parent['enumtype']
                and not self.auto
                and ordinal == self.lastordinal + 1))
        if (not format_matches or not sequence_matches
                or not self.is_enumerated_list_item(ordinal, sequence,
                                                    format)):
            # A different enumeration starts a new list.
            self.invalid_input()
        if sequence == '#':
            self.auto = 1
        item, blank_finish = self.list_item(match.end())
        self.parent += item
        self.blank_finish = blank_finish
        self.lastordinal = ordinal
        return [], next_state, []
2703
2704
class FieldList(SpecializedBody):

    """Second and subsequent field_list fields."""

    def field_marker(self, match, context, next_state):
        """Another field list field."""
        field_node, blank_finish = self.field(match)
        self.parent += field_node
        self.blank_finish = blank_finish
        return [], next_state, []
2715
2716
class OptionList(SpecializedBody):

    """Second and subsequent option_list option_list_items."""

    def option_marker(self, match, context, next_state):
        """Another option list item."""
        try:
            item, blank_finish = self.option_list_item(match)
        except MarkupError:
            # Not a well-formed option list item: end this list.
            self.invalid_input()
        self.parent += item
        self.blank_finish = blank_finish
        return [], next_state, []
2730
2731
class RFC2822List(SpecializedBody, RFC2822Body):

    """Second and subsequent RFC2822-style field_list fields."""

    patterns = RFC2822Body.patterns
    initial_transitions = RFC2822Body.initial_transitions

    def rfc2822(self, match, context, next_state):
        """Another RFC2822-style field."""
        field_node, blank_finish = self.rfc2822_field(match)
        self.parent += field_node
        self.blank_finish = blank_finish
        return [], 'RFC2822List', []

    # A blank line ends the RFC2822-style field list:
    blank = SpecializedBody.invalid_input
2747
2748
class ExtensionOptions(FieldList):

    """
    Parse field_list fields for extension options.

    No nested parsing is done (including inline markup parsing).
    """

    def parse_field_body(self, indented, offset, node) -> None:
        """Override `Body.parse_field_body` for simpler parsing."""
        # Group consecutive non-blank lines into plain paragraphs; the
        # appended '' sentinel flushes the final group.
        paragraph_lines = []
        for line in list(indented) + ['']:
            if not line.strip():
                if paragraph_lines:
                    text = '\n'.join(paragraph_lines)
                    node += nodes.paragraph(text, text)
                    paragraph_lines = []
            else:
                paragraph_lines.append(line)
2767
2768
class LineBlock(SpecializedBody):

    """Second and subsequent lines of a line_block."""

    # A blank line ends the line block:
    blank = SpecializedBody.invalid_input

    def line_block(self, match, context, next_state):
        """Another line of the current line block."""
        lineno = self.state_machine.abs_line_number()
        line, messages, blank_finish = self.line_block_line(match, lineno)
        self.parent += line
        # System messages go outside the line_block container.
        self.parent.parent += messages
        self.blank_finish = blank_finish
        return [], next_state, []
2783
2784
class Explicit(SpecializedBody):

    """Second and subsequent explicit markup construct."""

    def explicit_markup(self, match, context, next_state):
        """Footnotes, hyperlink targets, directives, comments."""
        node_list, blank_finish = self.explicit_construct(match)
        self.parent += node_list
        self.blank_finish = blank_finish
        return [], next_state, []

    def anonymous(self, match, context, next_state):
        """Anonymous hyperlink targets."""
        node_list, blank_finish = self.anonymous_target(match)
        self.parent += node_list
        self.blank_finish = blank_finish
        return [], next_state, []

    # A blank line ends the series of explicit markup constructs:
    blank = SpecializedBody.invalid_input
2804
2805
class SubstitutionDef(Body):

    """
    Parser for the contents of a substitution_definition element.
    """

    patterns = {
        'embedded_directive': re.compile(r'(%s)::( +|$)'
                                         % Inliner.simplename),
        'text': r''}
    initial_transitions = ['embedded_directive', 'text']

    def embedded_directive(self, match, context, next_state):
        """Directive whose output becomes the substitution's content."""
        node_list, blank_finish = self.directive(
            match, alt=self.parent['names'][0])
        self.parent += node_list
        if not self.state_machine.at_eof():
            self.blank_finish = blank_finish
        raise EOFError

    def text(self, match, context, next_state):
        """Ordinary text: the substitution definition ends here."""
        if not self.state_machine.at_eof():
            self.blank_finish = self.state_machine.is_next_line_blank()
        raise EOFError
2830
2831
class Text(RSTState):

    """
    Classifier of second line of a text block.

    Could be a paragraph, a definition list item, or a title.
    """

    patterns = {'underline': Body.patterns['line'],
                'text': r''}
    initial_transitions = [('underline', 'Body'), ('text', 'Body')]

    def blank(self, match, context, next_state):
        """End of paragraph."""
        # NOTE: self.paragraph returns [node, system_message(s)], literalnext
        paragraph, literalnext = self.paragraph(
            context, self.state_machine.abs_line_number() - 1)
        self.parent += paragraph
        if literalnext:
            # The paragraph ended with "::": a literal block follows.
            self.parent += self.literal_block()
        return [], 'Body', []

    def eof(self, context):
        """Flush any pending paragraph text at end of input."""
        if context:
            self.blank(None, context, None)
        return []

    def indent(self, match, context, next_state):
        """Definition list item."""
        dl = nodes.definition_list()
        # the definition list starts on the line before the indent:
        lineno = self.state_machine.abs_line_number() - 1
        dl.source, dl.line = self.state_machine.get_source_and_line(lineno)
        dl_item, blank_finish = self.definition_list_item(context)
        dl += dl_item
        self.parent += dl
        offset = self.state_machine.line_offset + 1  # next line
        # Parse subsequent definition list items in a nested session:
        newline_offset, blank_finish = self.nested_list_parse(
            self.state_machine.input_lines[offset:],
            input_offset=self.state_machine.abs_line_offset() + 1,
            node=dl, initial_state='DefinitionList',
            blank_finish=blank_finish, blank_finish_state='Definition')
        self.goto_line(newline_offset)
        if not blank_finish:
            self.parent += self.unindent_warning('Definition list')
        return [], 'Body', []

    def underline(self, match, context, next_state):
        """Section title."""
        lineno = self.state_machine.abs_line_number()
        title = context[0].rstrip()
        underline = match.string.rstrip()
        source = title + '\n' + underline
        messages = []
        if column_width(title) > len(underline):
            if len(underline) < 4:
                if self.state_machine.match_titles:
                    msg = self.reporter.info(
                        'Possible title underline, too short for the title.\n'
                        "Treating it as ordinary text because it's so short.",
                        line=lineno)
                    self.parent += msg
                # Re-dispatch the current line as ordinary text:
                raise statemachine.TransitionCorrection('text')
            else:
                blocktext = context[0] + '\n' + self.state_machine.line
                msg = self.reporter.warning(
                    'Title underline too short.',
                    nodes.literal_block(blocktext, blocktext),
                    line=lineno)
                messages.append(msg)
        if not self.state_machine.match_titles:
            # Titles are not allowed in this context (e.g. inside a
            # directive or nested parse):
            blocktext = context[0] + '\n' + self.state_machine.line
            # We need get_source_and_line() here to report correctly
            src, srcline = self.state_machine.get_source_and_line()
            # TODO: why is abs_line_number() == srcline+1
            # if the error is in a table (try with test_tables.py)?
            # print("get_source_and_line", srcline)
            # print("abs_line_number", self.state_machine.abs_line_number())
            msg = self.reporter.error(
                'Unexpected section title.',
                nodes.literal_block(blocktext, blocktext),
                source=src, line=srcline)
            self.parent += messages
            self.parent += msg
            return [], next_state, []
        style = underline[0]
        context[:] = []
        self.section(title, source, style, lineno - 1, messages)
        return [], next_state, []

    def text(self, match, context, next_state):
        """Paragraph."""
        startline = self.state_machine.abs_line_number() - 1
        msg = None
        try:
            block = self.state_machine.get_text_block(flush_left=True)
        except statemachine.UnexpectedIndentationError as err:
            # Keep the lines read so far; report the bad indentation.
            block, src, srcline = err.args
            msg = self.reporter.error('Unexpected indentation.',
                                      source=src, line=srcline)
        lines = context + list(block)
        paragraph, literalnext = self.paragraph(lines, startline)
        self.parent += paragraph
        self.parent += msg
        if literalnext:
            # Skip the blank line after the paragraph, if any:
            try:
                self.state_machine.next_line()
            except EOFError:
                pass
            self.parent += self.literal_block()
        return [], next_state, []

    def literal_block(self):
        """Return a list of nodes."""
        (indented, indent, offset, blank_finish
         ) = self.state_machine.get_indented()
        # Drop trailing blank lines from the block:
        while indented and not indented[-1].strip():
            indented.trim_end()
        if not indented:
            # No indented block: try a quoted (unindented) literal block.
            return self.quoted_literal_block()
        data = '\n'.join(indented)
        literal_block = nodes.literal_block(data, data)
        (literal_block.source,
         literal_block.line) = self.state_machine.get_source_and_line(offset+1)
        nodelist = [literal_block]
        if not blank_finish:
            nodelist.append(self.unindent_warning('Literal block'))
        return nodelist

    def quoted_literal_block(self):
        """Parse a quoted (unindented) literal block; return its nodes."""
        abs_line_offset = self.state_machine.abs_line_offset()
        offset = self.state_machine.line_offset
        parent_node = nodes.Element()
        # Nested parse restricted to the QuotedLiteralBlock state:
        new_abs_offset = self.nested_parse(
            self.state_machine.input_lines[offset:],
            input_offset=abs_line_offset, node=parent_node, match_titles=False,
            state_machine_kwargs={'state_classes': (QuotedLiteralBlock,),
                                  'initial_state': 'QuotedLiteralBlock'})
        self.goto_line(new_abs_offset)
        return parent_node.children

    def definition_list_item(self, termline):
        """Return a definition_list_item node and blank_finish."""
        # the parser is already on the second (indented) line:
        dd_lineno = self.state_machine.abs_line_number()
        dt_lineno = dd_lineno - 1
        (indented, indent, line_offset, blank_finish
         ) = self.state_machine.get_indented()
        dl_item = nodes.definition_list_item(
            '\n'.join(termline + list(indented)))
        (dl_item.source,
         dl_item.line) = self.state_machine.get_source_and_line(dt_lineno)
        dt_nodes, messages = self.term(termline, dt_lineno)
        dl_item += dt_nodes
        dd = nodes.definition('', *messages)
        dd.source, dd.line = self.state_machine.get_source_and_line(dd_lineno)
        dl_item += dd
        if termline[0][-2:] == '::':
            # "::" at the end of a term usually means the author wanted
            # a literal block but forgot the blank line before the indent.
            dd += self.reporter.info(
                'Blank line missing before literal block (after the "::")? '
                'Interpreted as a definition list item.',
                line=dd_lineno)
        # TODO: drop a definition if it is an empty comment to allow
        # definition list items with several terms?
        # https://sourceforge.net/p/docutils/feature-requests/60/
        self.nested_parse(indented, input_offset=line_offset, node=dd)
        return dl_item, blank_finish

    # " : " (classifier delimiter) separates term from classifier(s):
    classifier_delimiter = re.compile(' +: +')

    def term(self, lines, lineno):
        """Return a definition_list's term and optional classifiers."""
        assert len(lines) == 1
        text_nodes, messages = self.inline_text(lines[0], lineno)
        dt = nodes.term(lines[0])
        dt.source, dt.line = self.state_machine.get_source_and_line(lineno)
        node_list = [dt]
        for i in range(len(text_nodes)):
            node = text_nodes[i]
            if isinstance(node, nodes.Text):
                # Only plain Text nodes may contain classifier delimiters;
                # inline markup nodes are appended unchanged.
                parts = self.classifier_delimiter.split(node)
                if len(parts) == 1:
                    node_list[-1] += node
                else:
                    text = parts[0].rstrip()
                    textnode = nodes.Text(text)
                    node_list[-1] += textnode
                    node_list += [nodes.classifier(unescape(part, True), part)
                                  for part in parts[1:]]
            else:
                node_list[-1] += node
        return node_list, messages
3023
3024
class SpecializedText(Text):

    """
    Superclass for states parsing second and subsequent lines of
    Text-variants.

    Every transition method is disabled by rebinding it to
    `invalid_input`; subclasses selectively re-enable the ones they need.
    """

    def eof(self, context):
        """Incomplete construct."""
        return []

    def invalid_input(self, match=None, context=None, next_state=None):
        """Not a compound element member. Abort this state machine."""
        raise EOFError

    # Disable all of `Text`'s transitions:
    blank = invalid_input
    indent = invalid_input
    underline = invalid_input
    text = invalid_input
3046
3047
class Definition(SpecializedText):

    """Second line of potential definition_list_item."""

    def eof(self, context):
        """Not a definition."""
        # Back up over the term and this line so the parent state
        # machine can reassess both.
        self.state_machine.previous_line(2)
        return []

    def indent(self, match, context, next_state):
        """Definition list item."""
        item, blank_finish = self.definition_list_item(context)
        self.parent += item
        self.blank_finish = blank_finish
        return [], 'DefinitionList', []
3063
3064
class Line(SpecializedText):

    """
    Second line of over- & underlined section title or transition marker.
    """

    eofcheck = 1  # ignored, will be removed in Docutils 2.0.

    def eof(self, context):
        """Transition marker at end of section or document."""
        marker = context[0].strip()
        if len(marker) < 4:
            # Too short for a transition: reparse as ordinary text.
            self.state_correction(context)
        src, srcline = self.state_machine.get_source_and_line()
        # lineno = self.state_machine.abs_line_number() - 1
        transition = nodes.transition(rawsource=context[0])
        transition.source = src
        transition.line = srcline - 1
        # transition.line = lineno
        self.parent += transition
        return []

    def blank(self, match, context, next_state):
        """Transition marker."""
        src, srcline = self.state_machine.get_source_and_line()
        marker = context[0].strip()
        if len(marker) < 4:
            # Too short for a transition: reparse as ordinary text.
            self.state_correction(context)
        transition = nodes.transition(rawsource=marker)
        transition.source = src
        transition.line = srcline - 1
        self.parent += transition
        return [], 'Body', []

    def text(self, match, context, next_state):
        """Potential over- & underlined title."""
        lineno = self.state_machine.abs_line_number() - 1
        overline = context[0]
        title = match.string
        underline = ''
        try:
            underline = self.state_machine.next_line()
        except EOFError:
            # Overline + title at end of input: no underline possible.
            blocktext = overline + '\n' + title
            if len(overline.rstrip()) < 4:
                self.short_overline(context, blocktext, lineno, 2)
            else:
                msg = self.reporter.error(
                    'Incomplete section title.',
                    nodes.literal_block(blocktext, blocktext),
                    line=lineno)
                self.parent += msg
                return [], 'Body', []
        source = '%s\n%s\n%s' % (overline, title, underline)
        overline = overline.rstrip()
        underline = underline.rstrip()
        if not self.transitions['underline'][0].match(underline):
            # Third line is not a section-line at all.
            blocktext = overline + '\n' + title + '\n' + underline
            if len(overline.rstrip()) < 4:
                self.short_overline(context, blocktext, lineno, 2)
            else:
                msg = self.reporter.error(
                    'Missing matching underline for section title overline.',
                    nodes.literal_block(source, source),
                    line=lineno)
                self.parent += msg
                return [], 'Body', []
        elif overline != underline:
            # Over- and underline must be identical strings.
            blocktext = overline + '\n' + title + '\n' + underline
            if len(overline.rstrip()) < 4:
                self.short_overline(context, blocktext, lineno, 2)
            else:
                msg = self.reporter.error(
                    'Title overline & underline mismatch.',
                    nodes.literal_block(source, source),
                    line=lineno)
                self.parent += msg
                return [], 'Body', []
        title = title.rstrip()
        messages = []
        if column_width(title) > len(overline):
            # Title is wider than its overline: warn (but still a title).
            blocktext = overline + '\n' + title + '\n' + underline
            if len(overline.rstrip()) < 4:
                self.short_overline(context, blocktext, lineno, 2)
            else:
                msg = self.reporter.warning(
                    'Title overline too short.',
                    nodes.literal_block(source, source),
                    line=lineno)
                messages.append(msg)
        # Overlined titles use a (overline, underline) style pair.
        style = (overline[0], underline[0])
        self.section(title.lstrip(), source, style, lineno + 1, messages)
        return [], 'Body', []

    indent = text  # indented title

    def underline(self, match, context, next_state):
        """Section-line directly after an overline: no title between."""
        overline = context[0]
        blocktext = overline + '\n' + self.state_machine.line
        lineno = self.state_machine.abs_line_number() - 1
        if len(overline.rstrip()) < 4:
            self.short_overline(context, blocktext, lineno, 1)
        msg = self.reporter.error(
            'Invalid section title or transition marker.',
            nodes.literal_block(blocktext, blocktext),
            line=lineno)
        self.parent += msg
        return [], 'Body', []

    def short_overline(self, context, blocktext, lineno, lines=1) -> None:
        """Report a short overline; reparse the backed-up lines as text."""
        msg = self.reporter.info(
            'Possible incomplete section title.\nTreating the overline as '
            "ordinary text because it's so short.",
            line=lineno)
        self.parent += msg
        self.state_correction(context, lines)

    def state_correction(self, context, lines=1):
        """Back up `lines` lines and reparse them as ordinary text."""
        self.state_machine.previous_line(lines)
        context[:] = []
        raise statemachine.StateCorrection('Body', 'text')
3186
3187
class QuotedLiteralBlock(RSTState):

    """
    Nested parse handler for quoted (unindented) literal blocks.

    Special-purpose. Not for inclusion in `state_classes`.
    """

    patterns = {'initial_quoted': r'(%(nonalphanum7bit)s)' % Body.pats,
                'text': r''}
    initial_transitions = ('initial_quoted', 'text')

    def __init__(self, state_machine, debug=False) -> None:
        RSTState.__init__(self, state_machine, debug)
        self.messages = []          # accumulated system messages
        self.initial_lineno = None  # line number of the first quoted line

    def blank(self, match, context, next_state):
        """A blank line ends the block once content has been collected."""
        if context:
            raise EOFError
        return context, next_state, []

    def eof(self, context):
        """Emit the collected literal block (or warn if none was found)."""
        if context:
            src, srcline = self.state_machine.get_source_and_line(
                self.initial_lineno)
            text = '\n'.join(context)
            literal_block = nodes.literal_block(text, text)
            literal_block.source = src
            literal_block.line = srcline
            self.parent += literal_block
        else:
            self.parent += self.reporter.warning(
                'Literal block expected; none found.',
                line=self.state_machine.abs_line_number()
                )  # src not available, statemachine.input_lines is empty
            self.state_machine.previous_line()
        self.parent += self.messages
        return []

    def indent(self, match, context, next_state):
        """Indented line inside a quoted block: report and abort."""
        assert context, ('QuotedLiteralBlock.indent: context should not '
                         'be empty!')
        self.messages.append(
            self.reporter.error('Unexpected indentation.',
                                line=self.state_machine.abs_line_number()))
        self.state_machine.previous_line()
        raise EOFError

    def initial_quoted(self, match, context, next_state):
        """Match arbitrary quote character on the first line only."""
        self.remove_transition('initial_quoted')
        quote_char = match.string[0]
        # From now on, only lines starting with the same quote character
        # continue the block:
        self.add_transition(
            'quoted',
            (re.compile(re.escape(quote_char)),
             self.quoted, self.__class__.__name__))
        self.initial_lineno = self.state_machine.abs_line_number()
        return [match.string], next_state, []

    def quoted(self, match, context, next_state):
        """Match consistent quotes on subsequent lines."""
        context.append(match.string)
        return context, next_state, []

    def text(self, match, context, next_state):
        """Unquoted text: inconsistent quoting ends the block."""
        if context:
            self.messages.append(
                self.reporter.error('Inconsistent literal block quoting.',
                                    line=self.state_machine.abs_line_number()))
            self.state_machine.previous_line()
        raise EOFError
3261
3262
# `QuotedLiteralBlock` is deliberately omitted: it is a special-purpose
# state used only via `Text.quoted_literal_block`.
state_classes = (Body, BulletList, DefinitionList, EnumeratedList, FieldList,
                 OptionList, LineBlock, ExtensionOptions, Explicit, Text,
                 Definition, Line, SubstitutionDef, RFC2822Body, RFC2822List)
"""Standard set of State classes used to start `RSTStateMachine`."""