1# $Id$
2# Author: David Goodger <goodger@python.org>
3# Copyright: This module has been placed in the public domain.
4
5"""
6This is the ``docutils.parsers.rst.states`` module, the core of
7the reStructuredText parser. It defines the following:
8
9:Classes:
10 - `RSTStateMachine`: reStructuredText parser's entry point.
11 - `NestedStateMachine`: recursive StateMachine.
12 - `RSTState`: reStructuredText State superclass.
13 - `Inliner`: For parsing inline markup.
14 - `Body`: Generic classifier of the first line of a block.
15 - `SpecializedBody`: Superclass for compound element members.
16 - `BulletList`: Second and subsequent bullet_list list_items
17 - `DefinitionList`: Second+ definition_list_items.
18 - `EnumeratedList`: Second+ enumerated_list list_items.
19 - `FieldList`: Second+ fields.
20 - `OptionList`: Second+ option_list_items.
21 - `RFC2822List`: Second+ RFC2822-style fields.
22 - `ExtensionOptions`: Parses directive option fields.
23 - `Explicit`: Second+ explicit markup constructs.
24 - `SubstitutionDef`: For embedded directives in substitution definitions.
25 - `Text`: Classifier of second line of a text block.
26 - `SpecializedText`: Superclass for continuation lines of Text-variants.
27 - `Definition`: Second line of potential definition_list_item.
28 - `Line`: Second line of overlined section title or transition marker.
29 - `Struct`: An auxiliary collection class.
30
31:Exception classes:
32 - `MarkupError`
33 - `ParserError`
34 - `MarkupMismatch`
35
36:Functions:
37 - `escape2null()`: Return a string, escape-backslashes converted to nulls.
38 - `unescape()`: Return a string, nulls removed or restored to backslashes.
39
40:Attributes:
41 - `state_classes`: set of State classes used with `RSTStateMachine`.
42
43Parser Overview
44===============
45
46The reStructuredText parser is implemented as a recursive state machine,
47examining its input one line at a time. To understand how the parser works,
48please first become familiar with the `docutils.statemachine` module. In the
49description below, references are made to classes defined in this module;
50please see the individual classes for details.
51
52Parsing proceeds as follows:
53
541. The state machine examines each line of input, checking each of the
55 transition patterns of the state `Body`, in order, looking for a match.
56 The implicit transitions (blank lines and indentation) are checked before
57 any others. The 'text' transition is a catch-all (matches anything).
58
592. The method associated with the matched transition pattern is called.
60
61 A. Some transition methods are self-contained, appending elements to the
62 document tree (`Body.doctest` parses a doctest block). The parser's
63 current line index is advanced to the end of the element, and parsing
64 continues with step 1.
65
66 B. Other transition methods trigger the creation of a nested state machine,
67 whose job is to parse a compound construct ('indent' does a block quote,
68 'bullet' does a bullet list, 'overline' does a section [first checking
69 for a valid section header], etc.).
70
71 - In the case of lists and explicit markup, a one-off state machine is
72 created and run to parse contents of the first item.
73
74 - A new state machine is created and its initial state is set to the
75 appropriate specialized state (`BulletList` in the case of the
76 'bullet' transition; see `SpecializedBody` for more detail). This
77 state machine is run to parse the compound element (or series of
78 explicit markup elements), and returns as soon as a non-member element
79 is encountered. For example, the `BulletList` state machine ends as
80 soon as it encounters an element which is not a list item of that
81 bullet list. The optional omission of inter-element blank lines is
82 enabled by this nested state machine.
83
84 - The current line index is advanced to the end of the elements parsed,
85 and parsing continues with step 1.
86
87 C. The result of the 'text' transition depends on the next line of text.
88 The current state is changed to `Text`, under which the second line is
89 examined. If the second line is:
90
91 - Indented: The element is a definition list item, and parsing proceeds
92 similarly to step 2.B, using the `DefinitionList` state.
93
94 - A line of uniform punctuation characters: The element is a section
95 header; again, parsing proceeds as in step 2.B, and `Body` is still
96 used.
97
98 - Anything else: The element is a paragraph, which is examined for
99 inline markup and appended to the parent element. Processing
100 continues with step 1.
101"""
102
103__docformat__ = 'reStructuredText'
104
105
106import re
107from types import FunctionType, MethodType
108
109from docutils import nodes, statemachine, utils
110from docutils import ApplicationError, DataError
111from docutils.statemachine import StateMachineWS, StateWS
112from docutils.nodes import fully_normalize_name as normalize_name
113from docutils.nodes import unescape, whitespace_normalize_name
114import docutils.parsers.rst
115from docutils.parsers.rst import directives, languages, tableparser, roles
116from docutils.utils import escape2null, column_width
117from docutils.utils import punctuation_chars, urischemes
118from docutils.utils import split_escaped_whitespace
119from docutils.utils._roman_numerals import (
120 InvalidRomanNumeralError,
121 RomanNumeral,
122)
123
124
# Exception classes.
class MarkupError(DataError):
    pass


class UnknownInterpretedRoleError(DataError):
    pass


class InterpretedRoleNotImplementedError(DataError):
    pass


class ParserError(ApplicationError):
    pass


class MarkupMismatch(Exception):
    pass
130
131
class Struct:

    """Simple namespace: exposes its keyword arguments as attributes."""

    def __init__(self, **attributes) -> None:
        for name, value in attributes.items():
            setattr(self, name, value)
138
139
class RSTStateMachine(StateMachineWS):

    """
    reStructuredText's master StateMachine.

    The entry point to reStructuredText parsing is the `run()` method.
    """

    def run(self, input_lines, document, input_offset=0, match_titles=True,
            inliner=None) -> None:
        """
        Parse `input_lines` and modify the `document` node in place.

        Extend `StateMachineWS.run()`: set up parse-global data and
        run the StateMachine.
        """
        self.language = languages.get_language(
            document.settings.language_code, document.reporter)
        self.match_titles = match_titles
        if inliner is None:
            inliner = Inliner()
        inliner.init_customizations(document.settings)
        # Parse-global data, shared with nested state machines:
        memo = Struct(document=document,
                      reporter=document.reporter,
                      language=self.language,
                      title_styles=[],
                      section_level=0,
                      section_bubble_up_kludge=False,
                      inliner=inliner)
        self.memo = memo
        self.document = document
        self.attach_observer(document.note_source)
        self.reporter = memo.reporter
        self.node = document
        outcome = StateMachineWS.run(self, input_lines, input_offset,
                                     input_source=document['source'])
        assert outcome == [], 'RSTStateMachine.run() results should be empty!'
        self.node = self.memo = None  # remove unneeded references
177
178
class NestedStateMachine(StateMachineWS):

    """
    StateMachine run from within other StateMachine runs, to parse nested
    document structures.
    """

    def run(self, input_lines, input_offset, memo, node, match_titles=True):
        """
        Parse `input_lines` and populate a `docutils.nodes.document` instance.

        Extend `StateMachineWS.run()`: set up document-wide data.
        """
        self.memo = memo
        self.match_titles = match_titles
        self.node = node
        # Copy frequently-used objects out of the shared memo:
        self.document = memo.document
        self.reporter = memo.reporter
        self.language = memo.language
        self.attach_observer(self.document.note_source)
        parse_results = StateMachineWS.run(self, input_lines, input_offset)
        assert parse_results == [], ('NestedStateMachine.run() results '
                                     'should be empty!')
        return parse_results
203
204
class RSTState(StateWS):

    """
    reStructuredText State superclass.

    Contains methods used by all State subclasses.
    """

    nested_sm = NestedStateMachine
    # Cache of idle nested state machines (default class & kwargs only),
    # shared by all RSTState instances to avoid repeated construction.
    nested_sm_cache = []

    def __init__(self, state_machine, debug=False) -> None:
        self.nested_sm_kwargs = {'state_classes': state_classes,
                                 'initial_state': 'Body'}
        StateWS.__init__(self, state_machine, debug)

    def runtime_init(self) -> None:
        """Copy frequently used objects from the parse-global memo."""
        StateWS.runtime_init(self)
        memo = self.state_machine.memo
        self.memo = memo
        self.reporter = memo.reporter
        self.inliner = memo.inliner
        self.document = memo.document
        self.parent = self.state_machine.node
        # enable the reporter to determine source and source-line
        if not hasattr(self.reporter, 'get_source_and_line'):
            self.reporter.get_source_and_line = self.state_machine.get_source_and_line  # noqa:E501

    def goto_line(self, abs_line_offset) -> None:
        """
        Jump to input line `abs_line_offset`, ignoring jumps past the end.
        """
        try:
            self.state_machine.goto_line(abs_line_offset)
        except EOFError:
            pass

    def no_match(self, context, transitions):
        """
        Override `StateWS.no_match` to generate a system message.

        This code should never be run.
        """
        self.reporter.severe(
            'Internal error: no transition pattern match. State: "%s"; '
            'transitions: %s; context: %s; current line: %r.'
            % (self.__class__.__name__, transitions, context,
               self.state_machine.line))
        return context, None, []

    def bof(self, context):
        """Called at beginning of file."""
        return [], []

    def nested_parse(self, block, input_offset, node, match_titles=False,
                     state_machine_class=None, state_machine_kwargs=None):
        """
        Create a new StateMachine rooted at `node` and run it over the input
        `block`.

        Return the absolute line offset reached by the nested parse.
        """
        # A cached state machine may only be reused when both the state
        # machine class and its keyword arguments are the defaults:
        use_default = (state_machine_class is None
                       and state_machine_kwargs is None)
        if state_machine_class is None:
            state_machine_class = self.nested_sm
        if state_machine_kwargs is None:
            state_machine_kwargs = self.nested_sm_kwargs
        block_length = len(block)

        state_machine = None
        if use_default:
            try:
                state_machine = self.nested_sm_cache.pop()
            except IndexError:
                pass
        if not state_machine:
            state_machine = state_machine_class(debug=self.debug,
                                                **state_machine_kwargs)
        state_machine.run(block, input_offset, memo=self.memo,
                          node=node, match_titles=match_titles)
        if use_default:
            self.nested_sm_cache.append(state_machine)
        else:
            state_machine.unlink()
        new_offset = state_machine.abs_line_offset()
        # No `block.parent` implies disconnected -- lines aren't in sync:
        if block.parent and (len(block) - block_length) != 0:
            # Adjustment for block if modified in nested parse:
            self.state_machine.next_line(len(block) - block_length)
        return new_offset

    def nested_list_parse(self, block, input_offset, node, initial_state,
                          blank_finish,
                          blank_finish_state=None,
                          extra_settings=None,
                          match_titles=False,
                          state_machine_class=None,
                          state_machine_kwargs=None):
        """
        Create a new StateMachine rooted at `node` and run it over the input
        `block`. Also keep track of optional intermediate blank lines and the
        required final one.

        `extra_settings` is an optional mapping of attributes to set on the
        initial state.  Return (new absolute line offset, blank_finish flag).
        """
        if state_machine_class is None:
            state_machine_class = self.nested_sm
        if state_machine_kwargs is None:
            state_machine_kwargs = self.nested_sm_kwargs.copy()
        # NOTE(review): a caller-supplied `state_machine_kwargs` dict is
        # mutated here (behavior kept for compatibility).
        state_machine_kwargs['initial_state'] = initial_state
        state_machine = state_machine_class(debug=self.debug,
                                            **state_machine_kwargs)
        if blank_finish_state is None:
            blank_finish_state = initial_state
        state_machine.states[blank_finish_state].blank_finish = blank_finish
        # `None` default replaces the former mutable default argument `{}`:
        if extra_settings:
            for key, value in extra_settings.items():
                setattr(state_machine.states[initial_state], key, value)
        state_machine.run(block, input_offset, memo=self.memo,
                          node=node, match_titles=match_titles)
        blank_finish = state_machine.states[blank_finish_state].blank_finish
        state_machine.unlink()
        return state_machine.abs_line_offset(), blank_finish

    def section(self, title, source, style, lineno, messages) -> None:
        """Check for a valid subsection and create one if it checks out."""
        if self.check_subsection(source, style, lineno):
            self.new_subsection(title, lineno, messages)

    def check_subsection(self, source, style, lineno) -> bool:
        """
        Check for a valid subsection header. Return True or False.

        When a new section is reached that isn't a subsection of the current
        section, back up the line count (use ``previous_line(-x)``), then
        ``raise EOFError``. The current StateMachine will finish, then the
        calling StateMachine can re-examine the title. This will work its way
        back up the calling chain until the correct section level is reached.

        @@@ Alternative: Evaluate the title, store the title info & level, and
        back up the chain until that level is reached. Store in memo? Or
        return in results?

        :Exception: `EOFError` when a sibling or supersection encountered.
        """
        memo = self.memo
        title_styles = memo.title_styles
        mylevel = memo.section_level
        try:  # check for existing title style
            level = title_styles.index(style) + 1
        except ValueError:  # new title style
            if len(title_styles) == memo.section_level:  # new subsection
                title_styles.append(style)
                return True
            else:  # not at lowest level
                self.parent += self.title_inconsistent(source, lineno)
                return False
        if level <= mylevel:  # sibling or supersection
            memo.section_level = level  # bubble up to parent section
            if len(style) == 2:
                memo.section_bubble_up_kludge = True
            # back up 2 lines for underline title, 3 for overline title
            self.state_machine.previous_line(len(style) + 1)
            raise EOFError  # let parent section re-evaluate
        if level == mylevel + 1:  # immediate subsection
            return True
        else:  # invalid subsection
            self.parent += self.title_inconsistent(source, lineno)
            return False

    def title_inconsistent(self, sourcetext, lineno):
        """Return a severe system message about an inconsistent title."""
        error = self.reporter.severe(
            'Title level inconsistent:', nodes.literal_block('', sourcetext),
            line=lineno)
        return error

    def new_subsection(self, title, lineno, messages):
        """Append new subsection to document tree. On return, check level."""
        memo = self.memo
        mylevel = memo.section_level
        memo.section_level += 1
        section_node = nodes.section()
        self.parent += section_node
        textnodes, title_messages = self.inline_text(title, lineno)
        titlenode = nodes.title(title, '', *textnodes)
        name = normalize_name(titlenode.astext())
        section_node['names'].append(name)
        section_node += titlenode
        section_node += messages
        section_node += title_messages
        self.document.note_implicit_target(section_node, section_node)
        offset = self.state_machine.line_offset + 1
        absoffset = self.state_machine.abs_line_offset() + 1
        newabsoffset = self.nested_parse(
            self.state_machine.input_lines[offset:], input_offset=absoffset,
            node=section_node, match_titles=True)
        self.goto_line(newabsoffset)
        if memo.section_level <= mylevel:  # can't handle next section?
            raise EOFError  # bubble up to supersection
        # reset section_level; next pass will detect it properly
        memo.section_level = mylevel

    def paragraph(self, lines, lineno):
        """
        Return a list (paragraph & messages) & a boolean: literal_block next?
        """
        data = '\n'.join(lines).rstrip()
        # An unescaped '::' at the end announces a following literal block:
        if re.search(r'(?<!\\)(\\\\)*::$', data):
            if len(data) == 2:  # '::' alone: no paragraph at all
                return [], 1
            elif data[-3] in ' \n':  # whitespace before '::': drop it
                text = data[:-3].rstrip()
            else:  # 'text::': keep a single colon
                text = data[:-1]
            literalnext = 1
        else:
            text = data
            literalnext = 0
        textnodes, messages = self.inline_text(text, lineno)
        p = nodes.paragraph(data, '', *textnodes)
        p.source, p.line = self.state_machine.get_source_and_line(lineno)
        return [p] + messages, literalnext

    def inline_text(self, text, lineno):
        """
        Return 2 lists: nodes (text and inline elements), and system_messages.
        """
        nodes, messages = self.inliner.parse(text, lineno,
                                             self.memo, self.parent)
        return nodes, messages

    def unindent_warning(self, node_name):
        """Return a system message warning about an unexpected unindent."""
        # the actual problem is one line below the current line
        lineno = self.state_machine.abs_line_number() + 1
        return self.reporter.warning('%s ends without a blank line; '
                                     'unexpected unindent.' % node_name,
                                     line=lineno)
439
440
def build_regexp(definition, compile_patterns=True):
    """
    Build, compile and return a regular expression based on `definition`.

    :Parameter: `definition`: a 4-tuple (group name, prefix, suffix, parts),
        where "parts" is a list of regular expressions and/or regular
        expression definitions to be joined into an or-group.

    Return a compiled pattern if `compile_patterns` is true, else the
    pattern string.
    """
    name, prefix, suffix, parts = definition
    part_strings = []
    for part in parts:
        if isinstance(part, tuple):
            # Nested definition: expand to its (uncompiled) pattern string.
            part_strings.append(build_regexp(part, compile_patterns=False))
        else:
            part_strings.append(part)
    or_group = '|'.join(part_strings)
    regexp = f'{prefix}(?P<{name}>{or_group}){suffix}'
    if compile_patterns:
        return re.compile(regexp)
    return regexp
462
463
464class Inliner:
465
466 """
467 Parse inline markup; call the `parse()` method.
468 """
469
    def __init__(self) -> None:
        """Initialize the (initially empty) implicit-markup dispatch list."""
        self.implicit_dispatch = []
        """List of (pattern, bound method) tuples, used by
        `self.implicit_inline`."""
474
    def init_customizations(self, settings) -> None:
        """Compile the inline-markup recognition patterns from `settings`."""
        # lookahead and look-behind expressions for inline markup rules
        if getattr(settings, 'character_level_inline_markup', False):
            start_string_prefix = '(^|(?<!\x00))'
            end_string_suffix = ''
        else:
            start_string_prefix = ('(^|(?<=\\s|[%s%s]))' %
                                   (punctuation_chars.openers,
                                    punctuation_chars.delimiters))
            end_string_suffix = ('($|(?=\\s|[\x00%s%s%s]))' %
                                 (punctuation_chars.closing_delimiters,
                                  punctuation_chars.delimiters,
                                  punctuation_chars.closers))
        # values for %-interpolation into the verbose regexps below
        # (locals plus the pattern-fragment class attributes):
        args = locals().copy()
        args.update(vars(self.__class__))

        parts = ('initial_inline', start_string_prefix, '',
                 [
                  ('start', '', self.non_whitespace_after,  # simple start-strings
                   [r'\*\*',  # strong
                    r'\*(?!\*)',  # emphasis but not strong
                    r'``',  # literal
                    r'_`',  # inline internal target
                    r'\|(?!\|)']  # substitution reference
                   ),
                  ('whole', '', end_string_suffix,  # whole constructs
                   [  # reference name & end-string
                    r'(?P<refname>%s)(?P<refend>__?)' % self.simplename,
                    ('footnotelabel', r'\[', r'(?P<fnend>\]_)',
                     [r'[0-9]+',  # manually numbered
                      r'\#(%s)?' % self.simplename,  # auto-numbered (w/ label?)
                      r'\*',  # auto-symbol
                      r'(?P<citationlabel>%s)' % self.simplename,  # citation ref
                      ]
                     )
                    ]
                   ),
                  ('backquote',  # interpreted text or phrase reference
                   '(?P<role>(:%s:)?)' % self.simplename,  # optional role
                   self.non_whitespace_after,
                   ['`(?!`)']  # but not literal
                   )
                  ]
                 )
        self.start_string_prefix = start_string_prefix
        self.end_string_suffix = end_string_suffix
        self.parts = parts

        self.patterns = Struct(
            initial=build_regexp(parts),
            emphasis=re.compile(self.non_whitespace_escape_before
                                + r'(\*)' + end_string_suffix),
            strong=re.compile(self.non_whitespace_escape_before
                              + r'(\*\*)' + end_string_suffix),
            interpreted_or_phrase_ref=re.compile(
                r"""
                %(non_unescaped_whitespace_escape_before)s
                (
                  `
                  (?P<suffix>
                    (?P<role>:%(simplename)s:)?
                    (?P<refend>__?)?
                  )
                )
                %(end_string_suffix)s
                """ % args, re.VERBOSE),
            embedded_link=re.compile(
                r"""
                (
                  (?:[ \n]+|^)        # spaces or beginning of line/string
                  <                   # open bracket
                  %(non_whitespace_after)s
                  (([^<>]|\x00[<>])+) # anything but unescaped angle brackets
                  %(non_whitespace_escape_before)s
                  >                   # close bracket
                )
                $                     # end of string
                """ % args, re.VERBOSE),
            literal=re.compile(self.non_whitespace_before + '(``)'
                               + end_string_suffix),
            target=re.compile(self.non_whitespace_escape_before
                              + r'(`)' + end_string_suffix),
            substitution_ref=re.compile(self.non_whitespace_escape_before
                                        + r'(\|_{0,2})'
                                        + end_string_suffix),
            email=re.compile(self.email_pattern % args + '$',
                             re.VERBOSE),
            uri=re.compile(
                (r"""
                %(start_string_prefix)s
                (?P<whole>
                  (?P<absolute>           # absolute URI
                    (?P<scheme>             # scheme (http, ftp, mailto)
                      [a-zA-Z][a-zA-Z0-9.+-]*
                    )
                    :
                    (
                      (                       # either:
                        (//?)?                  # hierarchical URI
                        %(uric)s*               # URI characters
                        %(uri_end)s             # final URI char
                      )
                      (                       # optional query
                        \?%(uric)s*
                        %(uri_end)s
                      )?
                      (                       # optional fragment
                        \#%(uric)s*
                        %(uri_end)s
                      )?
                    )
                  )
                |                       # *OR*
                  (?P<email>              # email address
                   """ + self.email_pattern + r"""
                  )
                )
                %(end_string_suffix)s
                """) % args, re.VERBOSE),
            pep=re.compile(
                r"""
                %(start_string_prefix)s
                (
                  (pep-(?P<pepnum1>\d+)(.txt)?) # reference to source file
                |
                  (PEP\s+(?P<pepnum2>\d+))      # reference by name
                )
                %(end_string_suffix)s""" % args, re.VERBOSE),
            rfc=re.compile(
                r"""
                %(start_string_prefix)s
                (RFC(-|\s+)?(?P<rfcnum>\d+))
                %(end_string_suffix)s""" % args, re.VERBOSE))

        # Standalone URIs are always recognized; PEP and RFC references
        # only when the respective settings are enabled:
        self.implicit_dispatch.append((self.patterns.uri,
                                       self.standalone_uri))
        if settings.pep_references:
            self.implicit_dispatch.append((self.patterns.pep,
                                           self.pep_reference))
        if settings.rfc_references:
            self.implicit_dispatch.append((self.patterns.rfc,
                                           self.rfc_reference))
617
    def parse(self, text, lineno, memo, parent):
        # Needs to be refactored for nested inline markup.
        # Add nested_parse() method?
        """
        Return 2 lists: nodes (text and inline elements), and system_messages.

        Using `self.patterns.initial`, a pattern which matches start-strings
        (emphasis, strong, interpreted, phrase reference, literal,
        substitution reference, and inline target) and complete constructs
        (simple reference, footnote reference), search for a candidate. When
        one is found, check for validity (e.g., not a quoted '*' character).
        If valid, search for the corresponding end string if applicable, and
        check it for validity. If not found or invalid, generate a warning
        and ignore the start-string. Implicit inline markup (e.g. standalone
        URIs) is found last.

        :text: source string
        :lineno: absolute line number, cf. `statemachine.get_source_and_line()`
        """
        self.reporter = memo.reporter
        self.document = memo.document
        self.language = memo.language
        self.parent = parent
        pattern_search = self.patterns.initial.search
        dispatch = self.dispatch
        # Backslash-escaped characters are temporarily converted to nulls:
        remaining = escape2null(text)
        processed = []
        unprocessed = []
        messages = []
        while remaining:
            match = pattern_search(remaining)
            if match:
                groups = match.groupdict()
                # the first non-empty group selects the parsing method
                method = dispatch[groups['start'] or groups['backquote']
                                  or groups['refend'] or groups['fnend']]
                before, inlines, remaining, sysmessages = method(self, match,
                                                                 lineno)
                unprocessed.append(before)
                messages += sysmessages
                if inlines:
                    # text preceding the construct may hold implicit markup
                    processed += self.implicit_inline(''.join(unprocessed),
                                                      lineno)
                    processed += inlines
                    unprocessed = []
            else:
                break
        remaining = ''.join(unprocessed) + remaining
        if remaining:
            processed += self.implicit_inline(remaining, lineno)
        return processed, messages
668
    # Inline object recognition
    # -------------------------
    # See also init_customizations().
    # The class attributes below are pattern fragments, %-interpolated
    # into the compiled regexps in `init_customizations()`.
    non_whitespace_before = r'(?<!\s)'
    non_whitespace_escape_before = r'(?<![\s\x00])'
    non_unescaped_whitespace_escape_before = r'(?<!(?<!\x00)[\s\x00])'
    non_whitespace_after = r'(?!\s)'
    # Alphanumerics with isolated internal [-._+:] chars (i.e. not 2 together):
    simplename = r'(?:(?!_)\w)+(?:[-._+:](?:(?!_)\w)+)*'
    # Valid URI characters (see RFC 2396 & RFC 2732);
    # final \x00 allows backslash escapes in URIs:
    uric = r"""[-_.!~*'()[\];/:@&=+$,%a-zA-Z0-9\x00]"""
    # Delimiter indicating the end of a URI (not part of the URI):
    uri_end_delim = r"""[>]"""
    # Last URI character; same as uric but no punctuation:
    urilast = r"""[_~*/=+a-zA-Z0-9]"""
    # End of a URI (either 'urilast' or 'uric followed by a
    # uri_end_delim'):
    uri_end = r"""(?:%(urilast)s|%(uric)s(?=%(uri_end_delim)s))""" % locals()
    # E-mail address characters (the \x00 again allows backslash escapes):
    emailc = r"""[-_!~*'{|}/#?^`&=+$%a-zA-Z0-9\x00]"""
    email_pattern = r"""
          %(emailc)s+(?:\.%(emailc)s+)*   # name
          (?<!\x00)@                      # at
          %(emailc)s+(?:\.%(emailc)s*)*   # host
          %(uri_end)s                     # final URI char
          """
695
696 def quoted_start(self, match):
697 """Test if inline markup start-string is 'quoted'.
698
699 'Quoted' in this context means the start-string is enclosed in a pair
700 of matching opening/closing delimiters (not necessarily quotes)
701 or at the end of the match.
702 """
703 string = match.string
704 start = match.start()
705 if start == 0: # start-string at beginning of text
706 return False
707 prestart = string[start - 1]
708 try:
709 poststart = string[match.end()]
710 except IndexError: # start-string at end of text
711 return True # not "quoted" but no markup start-string either
712 return punctuation_chars.match_chars(prestart, poststart)
713
    def inline_obj(self, match, lineno, end_pattern, nodeclass,
                   restore_backslashes=False):
        """
        Parse a start-string/end-string construct matched by `match`.

        Return a 5-tuple: leading text, a list of nodes (one `nodeclass`
        instance, or one `problematic` node after a failure), remaining
        text, a list of system messages, and the matched end-string
        ('' if no complete construct was recognized).
        """
        string = match.string
        matchstart = match.start('start')
        matchend = match.end('start')
        if self.quoted_start(match):
            # not markup: leave the start-string in the output text
            return string[:matchend], [], string[matchend:], [], ''
        endmatch = end_pattern.search(string[matchend:])
        if endmatch and endmatch.start(1):  # 1 or more chars
            text = endmatch.string[:endmatch.start(1)]
            if restore_backslashes:
                text = unescape(text, True)
            textend = matchend + endmatch.end(1)
            rawsource = unescape(string[matchstart:textend], True)
            node = nodeclass(rawsource, text)
            return (string[:matchstart], [node],
                    string[textend:], [], endmatch.group(1))
        # no end-string found: emit a warning and a `problematic` node
        msg = self.reporter.warning(
            'Inline %s start-string without end-string.'
            % nodeclass.__name__, line=lineno)
        text = unescape(string[matchstart:matchend], True)
        prb = self.problematic(text, text, msg)
        return string[:matchstart], [prb], string[matchend:], [msg], ''
737
738 def problematic(self, text, rawsource, message):
739 msgid = self.document.set_id(message, self.parent)
740 problematic = nodes.problematic(rawsource, text, refid=msgid)
741 prbid = self.document.set_id(problematic)
742 message.add_backref(prbid)
743 return problematic
744
745 def emphasis(self, match, lineno):
746 before, inlines, remaining, sysmessages, endstring = self.inline_obj(
747 match, lineno, self.patterns.emphasis, nodes.emphasis)
748 return before, inlines, remaining, sysmessages
749
750 def strong(self, match, lineno):
751 before, inlines, remaining, sysmessages, endstring = self.inline_obj(
752 match, lineno, self.patterns.strong, nodes.strong)
753 return before, inlines, remaining, sysmessages
754
    def interpreted_or_phrase_ref(self, match, lineno):
        """
        Parse backquoted text: interpreted text or a phrase reference.

        Dispatch to `self.interpreted()` or `self.phrase_ref()` once the
        end-string is found; return (before, nodes, after, messages).
        """
        end_pattern = self.patterns.interpreted_or_phrase_ref
        string = match.string
        matchstart = match.start('backquote')
        matchend = match.end('backquote')
        rolestart = match.start('role')
        role = match.group('role')
        position = ''
        if role:
            role = role[1:-1]  # strip the enclosing colons
            position = 'prefix'
        elif self.quoted_start(match):
            return string[:matchend], [], string[matchend:], []
        endmatch = end_pattern.search(string[matchend:])
        if endmatch and endmatch.start(1):  # 1 or more chars
            textend = matchend + endmatch.end()
            if endmatch.group('role'):
                if role:
                    msg = self.reporter.warning(
                        'Multiple roles in interpreted text (both '
                        'prefix and suffix present; only one allowed).',
                        line=lineno)
                    text = unescape(string[rolestart:textend], True)
                    prb = self.problematic(text, text, msg)
                    return string[:rolestart], [prb], string[textend:], [msg]
                role = endmatch.group('suffix')[1:-1]
                position = 'suffix'
            escaped = endmatch.string[:endmatch.start(1)]
            rawsource = unescape(string[matchstart:textend], True)
            if rawsource[-1:] == '_':  # trailing '_': phrase reference
                if role:
                    msg = self.reporter.warning(
                        'Mismatch: both interpreted text role %s and '
                        'reference suffix.' % position, line=lineno)
                    text = unescape(string[rolestart:textend], True)
                    prb = self.problematic(text, text, msg)
                    return string[:rolestart], [prb], string[textend:], [msg]
                return self.phrase_ref(string[:matchstart], string[textend:],
                                       rawsource, escaped)
            else:
                rawsource = unescape(string[rolestart:textend], True)
                nodelist, messages = self.interpreted(rawsource, escaped, role,
                                                      lineno)
                return (string[:rolestart], nodelist,
                        string[textend:], messages)
        msg = self.reporter.warning(
            'Inline interpreted text or phrase reference start-string '
            'without end-string.', line=lineno)
        text = unescape(string[matchstart:matchend], True)
        prb = self.problematic(text, text, msg)
        return string[:matchstart], [prb], string[matchend:], [msg]
806
    def phrase_ref(self, before, after, rawsource, escaped, text=None):
        """
        Build nodes for a phrase reference, handling an embedded
        ``<URI>`` or ``<alias_>`` target if present.

        Return (before, node_list, after, []).
        """
        # `text` is ignored (since 0.16)
        match = self.patterns.embedded_link.search(escaped)
        if match:  # embedded <URI> or <alias_>
            # the reference text is everything before the embedded part:
            text = escaped[:match.start(0)]
            unescaped = unescape(text)
            rawtext = unescape(text, True)
            aliastext = match.group(2)
            rawaliastext = unescape(aliastext, True)
            underscore_escaped = rawaliastext.endswith(r'\_')
            if (aliastext.endswith('_')
                and not (underscore_escaped
                         or self.patterns.uri.match(aliastext))):
                aliastype = 'name'
                alias = normalize_name(unescape(aliastext[:-1]))
                target = nodes.target(match.group(1), refname=alias)
                target.indirect_reference_name = whitespace_normalize_name(
                    unescape(aliastext[:-1]))
            else:
                aliastype = 'uri'
                # remove unescaped whitespace
                alias_parts = split_escaped_whitespace(match.group(2))
                alias = ' '.join(''.join(part.split())
                                 for part in alias_parts)
                alias = self.adjust_uri(unescape(alias))
                if alias.endswith(r'\_'):
                    alias = alias[:-2] + '_'
                target = nodes.target(match.group(1), refuri=alias)
                target.referenced = 1
            if not aliastext:
                raise ApplicationError('problem with embedded link: %r'
                                       % aliastext)
            if not text:
                # no text before the embedded part: use the alias itself
                text = alias
                unescaped = unescape(text)
                rawtext = rawaliastext
        else:
            text = escaped
            unescaped = unescape(text)
            target = None
            rawtext = unescape(escaped, True)

        refname = normalize_name(unescaped)
        reference = nodes.reference(rawsource, text,
                                    name=whitespace_normalize_name(unescaped))
        reference[0].rawsource = rawtext

        node_list = [reference]

        if rawsource[-2:] == '__':  # anonymous reference
            if target and (aliastype == 'name'):
                reference['refname'] = alias
                self.document.note_refname(reference)
                # self.document.note_indirect_target(target) # required?
            elif target and (aliastype == 'uri'):
                reference['refuri'] = alias
            else:
                reference['anonymous'] = True
        else:
            if target:
                target['names'].append(refname)
                if aliastype == 'name':
                    reference['refname'] = alias
                    self.document.note_indirect_target(target)
                    self.document.note_refname(reference)
                else:
                    reference['refuri'] = alias
                    self.document.note_explicit_target(target, self.parent)
                    # target.note_referenced_by(name=refname)
                node_list.append(target)
            else:
                reference['refname'] = refname
                self.document.note_refname(reference)
        return before, node_list, after, []
881
882 def adjust_uri(self, uri):
883 match = self.patterns.email.match(uri)
884 if match:
885 return 'mailto:' + uri
886 else:
887 return uri
888
889 def interpreted(self, rawsource, text, role, lineno):
890 role_fn, messages = roles.role(role, self.language, lineno,
891 self.reporter)
892 if role_fn:
893 nodes, messages2 = role_fn(role, rawsource, text, lineno, self)
894 return nodes, messages + messages2
895 else:
896 msg = self.reporter.error(
897 'Unknown interpreted text role "%s".' % role,
898 line=lineno)
899 return ([self.problematic(rawsource, rawsource, msg)],
900 messages + [msg])
901
902 def literal(self, match, lineno):
903 before, inlines, remaining, sysmessages, endstring = self.inline_obj(
904 match, lineno, self.patterns.literal, nodes.literal,
905 restore_backslashes=True)
906 return before, inlines, remaining, sysmessages
907
    def inline_internal_target(self, match, lineno):
        """Handle an inline internal target ("_`target name`")."""
        before, inlines, remaining, sysmessages, endstring = self.inline_obj(
            match, lineno, self.patterns.target, nodes.target)
        if inlines and isinstance(inlines[0], nodes.target):
            # Register the target with the document under its
            # normalized name.
            assert len(inlines) == 1
            target = inlines[0]
            name = normalize_name(target.astext())
            target['names'].append(name)
            self.document.note_explicit_target(target, self.parent)
        return before, inlines, remaining, sysmessages
918
    def substitution_reference(self, match, lineno):
        """Handle a substitution reference ("|name|", "|name|_", "|name|__")."""
        before, inlines, remaining, sysmessages, endstring = self.inline_obj(
            match, lineno, self.patterns.substitution_ref,
            nodes.substitution_reference)
        if len(inlines) == 1:
            subref_node = inlines[0]
            if isinstance(subref_node, nodes.substitution_reference):
                subref_text = subref_node.astext()
                self.document.note_substitution_ref(subref_node, subref_text)
                if endstring[-1:] == '_':
                    # A trailing "_"/"__" makes this also a reference: wrap
                    # the substitution_reference in a reference node.
                    reference_node = nodes.reference(
                        '|%s%s' % (subref_text, endstring), '')
                    if endstring[-2:] == '__':
                        reference_node['anonymous'] = True
                    else:
                        reference_node['refname'] = normalize_name(subref_text)
                        self.document.note_refname(reference_node)
                    reference_node += subref_node
                    inlines = [reference_node]
        return before, inlines, remaining, sysmessages
939
    def footnote_reference(self, match, lineno):
        """
        Handles `nodes.footnote_reference` and `nodes.citation_reference`
        elements.
        """
        label = match.group('footnotelabel')
        refname = normalize_name(label)
        string = match.string
        before = string[:match.start('whole')]
        remaining = string[match.end('whole'):]
        if match.group('citationlabel'):
            # Citation-style label: build a citation reference instead.
            refnode = nodes.citation_reference('[%s]_' % label,
                                               refname=refname)
            refnode += nodes.Text(label)
            self.document.note_citation_ref(refnode)
        else:
            refnode = nodes.footnote_reference('[%s]_' % label)
            if refname[0] == '#':
                # "[#]_" / "[#label]_": auto-numbered footnote.
                refname = refname[1:]
                refnode['auto'] = 1
                self.document.note_autofootnote_ref(refnode)
            elif refname == '*':
                # "[*]_": auto-symbol footnote.
                refname = ''
                refnode['auto'] = '*'
                self.document.note_symbol_footnote_ref(
                    refnode)
            else:
                # Manually labeled footnote: keep the label as text.
                refnode += nodes.Text(label)
            if refname:
                refnode['refname'] = refname
                self.document.note_footnote_ref(refnode)
        if utils.get_trim_footnote_ref_space(self.document.settings):
            # Optionally drop the space before the reference marker.
            before = before.rstrip()
        return before, [refnode], remaining, []
974
975 def reference(self, match, lineno, anonymous=False):
976 referencename = match.group('refname')
977 refname = normalize_name(referencename)
978 referencenode = nodes.reference(
979 referencename + match.group('refend'), referencename,
980 name=whitespace_normalize_name(referencename))
981 referencenode[0].rawsource = referencename
982 if anonymous:
983 referencenode['anonymous'] = True
984 else:
985 referencenode['refname'] = refname
986 self.document.note_refname(referencenode)
987 string = match.string
988 matchstart = match.start('whole')
989 matchend = match.end('whole')
990 return string[:matchstart], [referencenode], string[matchend:], []
991
992 def anonymous_reference(self, match, lineno):
993 return self.reference(match, lineno, anonymous=True)
994
    def standalone_uri(self, match, lineno):
        """Handle a plain URI or email address found in running text."""
        if (not match.group('scheme')
            or match.group('scheme').lower() in urischemes.schemes):
            if match.group('email'):
                # Bare email address: link via "mailto:".
                addscheme = 'mailto:'
            else:
                addscheme = ''
            text = match.group('whole')
            refuri = addscheme + unescape(text)
            reference = nodes.reference(unescape(text, True), text,
                                        refuri=refuri)
            return [reference]
        else:  # not a valid scheme
            raise MarkupMismatch
1009
    def pep_reference(self, match, lineno):
        """Turn a "PEP 287" or "pep-0287" match into a reference to the PEP."""
        text = match.group(0)
        if text.startswith('pep-'):
            pepnum = int(unescape(match.group('pepnum1')))
        elif text.startswith('PEP'):
            pepnum = int(unescape(match.group('pepnum2')))
        else:
            raise MarkupMismatch
        # URL built from the configured base URL and file-name template.
        ref = (self.document.settings.pep_base_url
               + self.document.settings.pep_file_url_template % pepnum)
        return [nodes.reference(unescape(text, True), text, refuri=ref)]
1021
    # File-name template, filled with the RFC number and appended to
    # ``settings.rfc_base_url`` by `rfc_reference` below.
    rfc_url = 'rfc%d.html'
1023
1024 def rfc_reference(self, match, lineno):
1025 text = match.group(0)
1026 if text.startswith('RFC'):
1027 rfcnum = int(unescape(match.group('rfcnum')))
1028 ref = self.document.settings.rfc_base_url + self.rfc_url % rfcnum
1029 else:
1030 raise MarkupMismatch
1031 return [nodes.reference(unescape(text, True), text, refuri=ref)]
1032
    def implicit_inline(self, text, lineno):
        """
        Check each of the patterns in `self.implicit_dispatch` for a match,
        and dispatch to the stored method for the pattern. Recursively check
        the text before and after the match. Return a list of `nodes.Text`
        and inline element nodes.
        """
        if not text:
            return []
        for pattern, method in self.implicit_dispatch:
            match = pattern.search(text)
            if match:
                try:
                    # Must recurse on strings before *and* after the match;
                    # there may be multiple patterns.
                    return (self.implicit_inline(text[:match.start()], lineno)
                            + method(match, lineno)
                            + self.implicit_inline(text[match.end():], lineno))
                except MarkupMismatch:
                    # Handler rejected the match; try the other patterns.
                    pass
        return [nodes.Text(text)]
1054
    # Map each inline markup start-string to its parsing method.
    dispatch = {'*': emphasis,
                '**': strong,
                '`': interpreted_or_phrase_ref,
                '``': literal,
                '_`': inline_internal_target,
                ']_': footnote_reference,
                '|': substitution_reference,
                '_': reference,
                '__': anonymous_reference}
1064
1065
1066def _loweralpha_to_int(s, _zero=(ord('a')-1)):
1067 return ord(s) - _zero
1068
1069
1070def _upperalpha_to_int(s, _zero=(ord('A')-1)):
1071 return ord(s) - _zero
1072
1073
1074class Body(RSTState):
1075
1076 """
1077 Generic classifier of the first line of a block.
1078 """
1079
    double_width_pad_char = tableparser.TableParser.double_width_pad_char
    """Padding character for East Asian double-width text."""

    enum = Struct()
    """Enumerated list parsing information."""

    # Per-format prefix/suffix and the slice (`start`:`end`) that extracts
    # the enumerator text from the matched marker.
    enum.formatinfo = {
        'parens': Struct(prefix='(', suffix=')', start=1, end=-1),
        'rparen': Struct(prefix='', suffix=')', start=0, end=-1),
        'period': Struct(prefix='', suffix='.', start=0, end=-1)}
    enum.formats = enum.formatinfo.keys()
    enum.sequences = ['arabic', 'loweralpha', 'upperalpha',
                      'lowerroman', 'upperroman']  # ORDERED!
    enum.sequencepats = {'arabic': '[0-9]+',
                         'loweralpha': '[a-z]',
                         'upperalpha': '[A-Z]',
                         'lowerroman': '[ivxlcdm]+',
                         'upperroman': '[IVXLCDM]+'}
    # Callables mapping enumerator text to its ordinal value.
    enum.converters = {'arabic': int,
                       'loweralpha': _loweralpha_to_int,
                       'upperalpha': _upperalpha_to_int,
                       'lowerroman': RomanNumeral.from_string,
                       'upperroman': RomanNumeral.from_string}

    # Anchored regexps used to validate enumerator text per sequence.
    enum.sequenceregexps = {}
    for sequence in enum.sequences:
        enum.sequenceregexps[sequence] = re.compile(
            enum.sequencepats[sequence] + '$')

    grid_table_top_pat = re.compile(r'\+-[-+]+-\+ *$')
    """Matches the top (& bottom) of a full table)."""

    simple_table_top_pat = re.compile('=+( +=+)+ *$')
    """Matches the top of a simple table."""

    simple_table_border_pat = re.compile('=+[ =]*$')
    """Matches the bottom & header bottom of a simple table."""

    pats = {}
    """Fragments of patterns used by transitions."""

    pats['nonalphanum7bit'] = '[!-/:-@[-`{-~]'
    pats['alpha'] = '[a-zA-Z]'
    pats['alphanum'] = '[a-zA-Z0-9]'
    pats['alphanumplus'] = '[a-zA-Z0-9_-]'
    pats['enum'] = ('(%(arabic)s|%(loweralpha)s|%(upperalpha)s|%(lowerroman)s'
                    '|%(upperroman)s|#)' % enum.sequencepats)
    pats['optname'] = '%(alphanum)s%(alphanumplus)s*' % pats
    # @@@ Loosen up the pattern? Allow Unicode?
    pats['optarg'] = '(%(alpha)s%(alphanumplus)s*|<[^<>]+>)' % pats
    pats['shortopt'] = r'(-|\+)%(alphanum)s( ?%(optarg)s)?' % pats
    pats['longopt'] = r'(--|/)%(optname)s([ =]%(optarg)s)?' % pats
    pats['option'] = r'(%(shortopt)s|%(longopt)s)' % pats

    # One named group per enumerator format (e.g. "(1)", "1)", "1."):
    for format in enum.formats:
        pats[format] = '(?P<%s>%s%s%s)' % (
            format, re.escape(enum.formatinfo[format].prefix),
            pats['enum'], re.escape(enum.formatinfo[format].suffix))

    # Transition patterns: the first line of each body construct.
    patterns = {
        'bullet': '[-+*\u2022\u2023\u2043]( +|$)',
        'enumerator': r'(%(parens)s|%(rparen)s|%(period)s)( +|$)' % pats,
        'field_marker': r':(?![: ])([^:\\]|\\.|:(?!([ `]|$)))*(?<! ):( +|$)',
        'option_marker': r'%(option)s(, %(option)s)*( +| ?$)' % pats,
        'doctest': r'>>>( +|$)',
        'line_block': r'\|( +|$)',
        'grid_table_top': grid_table_top_pat,
        'simple_table_top': simple_table_top_pat,
        'explicit_markup': r'\.\.( +|$)',
        'anonymous': r'__( +|$)',
        'line': r'(%(nonalphanum7bit)s)\1* *$' % pats,
        'text': r''}
    # Transition names, tried in this order against each line.
    initial_transitions = (
        'bullet',
        'enumerator',
        'field_marker',
        'option_marker',
        'doctest',
        'line_block',
        'grid_table_top',
        'simple_table_top',
        'explicit_markup',
        'anonymous',
        'line',
        'text')
1165
1166 def indent(self, match, context, next_state):
1167 """Block quote."""
1168 (indented, indent, line_offset, blank_finish
1169 ) = self.state_machine.get_indented()
1170 elements = self.block_quote(indented, line_offset)
1171 self.parent += elements
1172 if not blank_finish:
1173 self.parent += self.unindent_warning('Block quote')
1174 return context, next_state, []
1175
    def block_quote(self, indented, line_offset):
        """
        Parse `indented` into block_quote elements (with attributions).

        `split_attribution` may leave remaining lines, producing several
        block quotes from one indented block.
        """
        elements = []
        while indented:
            blockquote = nodes.block_quote(rawsource='\n'.join(indented))
            (blockquote.source, blockquote.line
             ) = self.state_machine.get_source_and_line(line_offset+1)
            (blockquote_lines,
             attribution_lines,
             attribution_offset,
             indented,
             new_line_offset) = self.split_attribution(indented, line_offset)
            self.nested_parse(blockquote_lines, line_offset, blockquote)
            elements.append(blockquote)
            if attribution_lines:
                attribution, messages = self.parse_attribution(
                    attribution_lines, line_offset+attribution_offset)
                blockquote += attribution
                elements += messages
            line_offset = new_line_offset
            # Skip blank lines before any following block quote segment.
            while indented and not indented[0]:
                indented = indented[1:]
                line_offset += 1
        return elements
1199
    # U+2014 is an em-dash:
    attribution_pattern = re.compile('(---?(?!-)|\u2014) *(?=[^ \\n])')

    def split_attribution(self, indented, line_offset):
        """
        Check for a block quote attribution and split it off:

        * First line after a blank line must begin with a dash ("--", "---",
          em-dash; matches `self.attribution_pattern`).
        * Every line after that must have consistent indentation.
        * Attributions must be preceded by block quote content.

        Return a tuple of: (block quote content lines, attribution lines,
        attribution offset, remaining indented lines, remaining lines offset).
        """
        blank = None
        nonblank_seen = False
        for i in range(len(indented)):
            line = indented[i].rstrip()
            if line:
                if nonblank_seen and blank == i - 1:  # last line blank
                    match = self.attribution_pattern.match(line)
                    if match:
                        attribution_end, indent = self.check_attribution(
                            indented, i)
                        if attribution_end:
                            a_lines = indented[i:attribution_end]
                            # Strip the dash marker off the first line and
                            # the hanging indent off continuation lines.
                            a_lines.trim_left(match.end(), end=1)
                            a_lines.trim_left(indent, start=1)
                            return (indented[:i], a_lines,
                                    i, indented[attribution_end:],
                                    line_offset + attribution_end)
                nonblank_seen = True
            else:
                blank = i
        else:
            # No attribution found: everything is block quote content.
            return indented, None, None, None, None
1237
    def check_attribution(self, indented, attribution_start):
        """
        Check attribution shape.
        Return the index past the end of the attribution, and the indent.
        """
        indent = None
        # `i` needs a value in case the loop body never runs
        # (one-line attribution at the end of `indented`).
        i = attribution_start + 1
        for i in range(attribution_start + 1, len(indented)):
            line = indented[i].rstrip()
            if not line:
                # Blank line terminates the attribution.
                break
            if indent is None:
                indent = len(line) - len(line.lstrip())
            elif len(line) - len(line.lstrip()) != indent:
                # Inconsistent hanging indent.
                return None, None  # bad shape; not an attribution
        else:
            # return index of line after last attribution line:
            i += 1
        return i, (indent or 0)
1257
1258 def parse_attribution(self, indented, line_offset):
1259 text = '\n'.join(indented).rstrip()
1260 lineno = 1 + line_offset # line_offset is zero-based
1261 textnodes, messages = self.inline_text(text, lineno)
1262 node = nodes.attribution(text, '', *textnodes)
1263 node.source, node.line = self.state_machine.get_source_and_line(lineno)
1264 return node, messages
1265
    def bullet(self, match, context, next_state):
        """Bullet list item."""
        ul = nodes.bullet_list()
        ul.source, ul.line = self.state_machine.get_source_and_line()
        self.parent += ul
        ul['bullet'] = match.string[0]
        i, blank_finish = self.list_item(match.end())
        ul += i
        offset = self.state_machine.line_offset + 1  # next line
        # Parse subsequent items with the specialized `BulletList` state.
        new_line_offset, blank_finish = self.nested_list_parse(
            self.state_machine.input_lines[offset:],
            input_offset=self.state_machine.abs_line_offset() + 1,
            node=ul, initial_state='BulletList',
            blank_finish=blank_finish)
        self.goto_line(new_line_offset)
        if not blank_finish:
            self.parent += self.unindent_warning('Bullet list')
        return [], next_state, []
1284
    def list_item(self, indent):
        """Parse one list item; return (list_item node, blank_finish)."""
        src, srcline = self.state_machine.get_source_and_line()
        if self.state_machine.line[indent:]:
            # Text follows the marker on the same line: indent is known.
            indented, line_offset, blank_finish = (
                self.state_machine.get_known_indented(indent))
        else:
            # Marker alone on its line: first indented line sets the indent.
            indented, indent, line_offset, blank_finish = (
                self.state_machine.get_first_known_indented(indent))
        listitem = nodes.list_item('\n'.join(indented))
        listitem.source, listitem.line = src, srcline
        if indented:
            self.nested_parse(indented, input_offset=line_offset,
                              node=listitem)
        return listitem, blank_finish
1299
    def enumerator(self, match, context, next_state):
        """Enumerated List Item"""
        format, sequence, text, ordinal = self.parse_enumerator(match)
        if not self.is_enumerated_list_item(ordinal, sequence, format):
            # Not a valid list item after all: reparse the line as text.
            raise statemachine.TransitionCorrection('text')
        enumlist = nodes.enumerated_list()
        self.parent += enumlist
        if sequence == '#':
            enumlist['enumtype'] = 'arabic'
        else:
            enumlist['enumtype'] = sequence
        enumlist['prefix'] = self.enum.formatinfo[format].prefix
        enumlist['suffix'] = self.enum.formatinfo[format].suffix
        if ordinal != 1:
            enumlist['start'] = ordinal
            msg = self.reporter.info(
                'Enumerated list start value not ordinal-1: "%s" (ordinal %s)'
                % (text, ordinal))
            self.parent += msg
        listitem, blank_finish = self.list_item(match.end())
        enumlist += listitem
        offset = self.state_machine.line_offset + 1  # next line
        # Parse subsequent items with the specialized `EnumeratedList` state,
        # passing along the expected ordinal/format for validation.
        newline_offset, blank_finish = self.nested_list_parse(
            self.state_machine.input_lines[offset:],
            input_offset=self.state_machine.abs_line_offset() + 1,
            node=enumlist, initial_state='EnumeratedList',
            blank_finish=blank_finish,
            extra_settings={'lastordinal': ordinal,
                            'format': format,
                            'auto': sequence == '#'})
        self.goto_line(newline_offset)
        if not blank_finish:
            self.parent += self.unindent_warning('Enumerated list')
        return [], next_state, []
1334
1335 def parse_enumerator(self, match, expected_sequence=None):
1336 """
1337 Analyze an enumerator and return the results.
1338
1339 :Return:
1340 - the enumerator format ('period', 'parens', or 'rparen'),
1341 - the sequence used ('arabic', 'loweralpha', 'upperroman', etc.),
1342 - the text of the enumerator, stripped of formatting, and
1343 - the ordinal value of the enumerator ('a' -> 1, 'ii' -> 2, etc.;
1344 ``None`` is returned for invalid enumerator text).
1345
1346 The enumerator format has already been determined by the regular
1347 expression match. If `expected_sequence` is given, that sequence is
1348 tried first. If not, we check for Roman numeral 1. This way,
1349 single-character Roman numerals (which are also alphabetical) can be
1350 matched. If no sequence has been matched, all sequences are checked in
1351 order.
1352 """
1353 groupdict = match.groupdict()
1354 sequence = ''
1355 for format in self.enum.formats:
1356 if groupdict[format]: # was this the format matched?
1357 break # yes; keep `format`
1358 else: # shouldn't happen
1359 raise ParserError('enumerator format not matched')
1360 text = groupdict[format][self.enum.formatinfo[format].start # noqa: E203,E501
1361 : self.enum.formatinfo[format].end]
1362 if text == '#':
1363 sequence = '#'
1364 elif expected_sequence:
1365 try:
1366 if self.enum.sequenceregexps[expected_sequence].match(text):
1367 sequence = expected_sequence
1368 except KeyError: # shouldn't happen
1369 raise ParserError('unknown enumerator sequence: %s'
1370 % sequence)
1371 elif text == 'i':
1372 sequence = 'lowerroman'
1373 elif text == 'I':
1374 sequence = 'upperroman'
1375 if not sequence:
1376 for sequence in self.enum.sequences:
1377 if self.enum.sequenceregexps[sequence].match(text):
1378 break
1379 else: # shouldn't happen
1380 raise ParserError('enumerator sequence not matched')
1381 if sequence == '#':
1382 ordinal = 1
1383 else:
1384 try:
1385 ordinal = int(self.enum.converters[sequence](text))
1386 except InvalidRomanNumeralError:
1387 ordinal = None
1388 return format, sequence, text, ordinal
1389
    def is_enumerated_list_item(self, ordinal, sequence, format):
        """
        Check validity based on the ordinal value and the second line.

        Return true if the ordinal is valid and the second line is blank,
        indented, or starts with the next enumerator or an auto-enumerator.
        """
        if ordinal is None:
            return None
        try:
            next_line = self.state_machine.next_line()
        except EOFError:  # end of input lines
            self.state_machine.previous_line()
            return 1
        else:
            # Restore position after peeking at the next line.
            self.state_machine.previous_line()
        if not next_line[:1].strip():  # blank or indented
            return 1
        result = self.make_enumerator(ordinal + 1, sequence, format)
        if result:
            next_enumerator, auto_enumerator = result
            try:
                if next_line.startswith((next_enumerator, auto_enumerator)):
                    return 1
            except TypeError:
                pass
        return None
1417
    def make_enumerator(self, ordinal, sequence, format):
        """
        Construct and return the next enumerated list item marker, and an
        auto-enumerator ("#" instead of the regular enumerator).

        Return ``None`` for invalid (out of range) ordinals.
        """
        if sequence == '#':
            enumerator = '#'
        elif sequence == 'arabic':
            enumerator = str(ordinal)
        else:
            if sequence.endswith('alpha'):
                if ordinal > 26:
                    # No alphabetic enumerator beyond 'z'/'Z'.
                    return None
                enumerator = chr(ordinal + ord('a') - 1)
            elif sequence.endswith('roman'):
                try:
                    enumerator = RomanNumeral(ordinal).to_uppercase()
                except TypeError:
                    # `RomanNumeral` rejected the value; treat as invalid.
                    return None
            else:  # shouldn't happen
                raise ParserError('unknown enumerator sequence: "%s"'
                                  % sequence)
            if sequence.startswith('lower'):
                enumerator = enumerator.lower()
            elif sequence.startswith('upper'):
                enumerator = enumerator.upper()
            else:  # shouldn't happen
                raise ParserError('unknown enumerator sequence: "%s"'
                                  % sequence)
        formatinfo = self.enum.formatinfo[format]
        next_enumerator = (formatinfo.prefix + enumerator + formatinfo.suffix
                           + ' ')
        auto_enumerator = formatinfo.prefix + '#' + formatinfo.suffix + ' '
        return next_enumerator, auto_enumerator
1454
1455 def field_marker(self, match, context, next_state):
1456 """Field list item."""
1457 field_list = nodes.field_list()
1458 self.parent += field_list
1459 field, blank_finish = self.field(match)
1460 field_list += field
1461 offset = self.state_machine.line_offset + 1 # next line
1462 newline_offset, blank_finish = self.nested_list_parse(
1463 self.state_machine.input_lines[offset:],
1464 input_offset=self.state_machine.abs_line_offset() + 1,
1465 node=field_list, initial_state='FieldList',
1466 blank_finish=blank_finish)
1467 self.goto_line(newline_offset)
1468 if not blank_finish:
1469 self.parent += self.unindent_warning('Field list')
1470 return [], next_state, []
1471
    def field(self, match):
        """Parse one field; return (field node, blank_finish)."""
        name = self.parse_field_marker(match)
        src, srcline = self.state_machine.get_source_and_line()
        lineno = self.state_machine.abs_line_number()
        (indented, indent, line_offset, blank_finish
         ) = self.state_machine.get_first_known_indented(match.end())
        field_node = nodes.field()
        field_node.source = src
        field_node.line = srcline
        # The field name itself may contain inline markup.
        name_nodes, name_messages = self.inline_text(name, lineno)
        field_node += nodes.field_name(name, '', *name_nodes)
        field_body = nodes.field_body('\n'.join(indented), *name_messages)
        field_node += field_body
        if indented:
            self.parse_field_body(indented, line_offset, field_body)
        return field_node, blank_finish
1488
1489 def parse_field_marker(self, match):
1490 """Extract & return field name from a field marker match."""
1491 field = match.group()[1:] # strip off leading ':'
1492 field = field[:field.rfind(':')] # strip off trailing ':' etc.
1493 return field
1494
1495 def parse_field_body(self, indented, offset, node) -> None:
1496 self.nested_parse(indented, input_offset=offset, node=node)
1497
    def option_marker(self, match, context, next_state):
        """Option list item."""
        optionlist = nodes.option_list()
        (optionlist.source, optionlist.line
         ) = self.state_machine.get_source_and_line()
        try:
            listitem, blank_finish = self.option_list_item(match)
        except MarkupError as error:
            # This shouldn't happen; pattern won't match.
            msg = self.reporter.error('Invalid option list marker: %s'
                                      % error)
            self.parent += msg
            # Recover by parsing the indented text as a block quote.
            (indented, indent, line_offset, blank_finish
             ) = self.state_machine.get_first_known_indented(match.end())
            elements = self.block_quote(indented, line_offset)
            self.parent += elements
            if not blank_finish:
                self.parent += self.unindent_warning('Option list')
            return [], next_state, []
        self.parent += optionlist
        optionlist += listitem
        offset = self.state_machine.line_offset + 1  # next line
        # Parse subsequent items with the specialized `OptionList` state.
        newline_offset, blank_finish = self.nested_list_parse(
            self.state_machine.input_lines[offset:],
            input_offset=self.state_machine.abs_line_offset() + 1,
            node=optionlist, initial_state='OptionList',
            blank_finish=blank_finish)
        self.goto_line(newline_offset)
        if not blank_finish:
            self.parent += self.unindent_warning('Option list')
        return [], next_state, []
1529
1530 def option_list_item(self, match):
1531 offset = self.state_machine.abs_line_offset()
1532 options = self.parse_option_marker(match)
1533 (indented, indent, line_offset, blank_finish
1534 ) = self.state_machine.get_first_known_indented(match.end())
1535 if not indented: # not an option list item
1536 self.goto_line(offset)
1537 raise statemachine.TransitionCorrection('text')
1538 option_group = nodes.option_group('', *options)
1539 description = nodes.description('\n'.join(indented))
1540 option_list_item = nodes.option_list_item('', option_group,
1541 description)
1542 if indented:
1543 self.nested_parse(indented, input_offset=line_offset,
1544 node=description)
1545 return option_list_item, blank_finish
1546
    def parse_option_marker(self, match):
        """
        Return a list of `node.option` and `node.option_argument` objects,
        parsed from an option marker match.

        :Exception: `MarkupError` for invalid option markers.
        """
        optlist = []
        # split at ", ", except inside < > (complex arguments)
        optionstrings = re.split(r', (?![^<]*>)', match.group().rstrip())
        for optionstring in optionstrings:
            tokens = optionstring.split()
            delimiter = ' '
            firstopt = tokens[0].split('=', 1)
            if len(firstopt) > 1:
                # "--opt=value" form
                tokens[:1] = firstopt
                delimiter = '='
            elif (len(tokens[0]) > 2
                  and ((tokens[0].startswith('-')
                        and not tokens[0].startswith('--'))
                       or tokens[0].startswith('+'))):
                # "-ovalue" form
                tokens[:1] = [tokens[0][:2], tokens[0][2:]]
                delimiter = ''
            if len(tokens) > 1 and (tokens[1].startswith('<')
                                    and tokens[-1].endswith('>')):
                # "-o <value1 value2>" form; join all values into one token
                tokens[1:] = [' '.join(tokens[1:])]
            # Exactly one option name, optionally with one argument.
            if 0 < len(tokens) <= 2:
                option = nodes.option(optionstring)
                option += nodes.option_string(tokens[0], tokens[0])
                if len(tokens) > 1:
                    option += nodes.option_argument(tokens[1], tokens[1],
                                                    delimiter=delimiter)
                optlist.append(option)
            else:
                raise MarkupError(
                    'wrong number of option tokens (=%s), should be 1 or 2: '
                    '"%s"' % (len(tokens), optionstring))
        return optlist
1588
1589 def doctest(self, match, context, next_state):
1590 line = self.document.current_line
1591 data = '\n'.join(self.state_machine.get_text_block())
1592 # TODO: Parse with `directives.body.CodeBlock` with
1593 # argument 'pycon' (Python Console) in Docutils 1.0.
1594 n = nodes.doctest_block(data, data)
1595 n.line = line
1596 self.parent += n
1597 return [], next_state, []
1598
    def line_block(self, match, context, next_state):
        """First line of a line block."""
        block = nodes.line_block()
        self.parent += block
        lineno = self.state_machine.abs_line_number()
        (block.source,
         block.line) = self.state_machine.get_source_and_line(lineno)
        line, messages, blank_finish = self.line_block_line(match, lineno)
        block += line
        self.parent += messages
        if not blank_finish:
            # Collect the rest of the block with the specialized
            # `LineBlock` state.
            offset = self.state_machine.line_offset + 1  # next line
            new_line_offset, blank_finish = self.nested_list_parse(
                self.state_machine.input_lines[offset:],
                input_offset=self.state_machine.abs_line_offset() + 1,
                node=block, initial_state='LineBlock',
                blank_finish=0)
            self.goto_line(new_line_offset)
        if not blank_finish:
            self.parent += self.reporter.warning(
                'Line block ends without a blank line.',
                line=lineno+1)
        if len(block):
            if block[0].indent is None:
                block[0].indent = 0
            self.nest_line_block_lines(block)
        return [], next_state, []
1626
    def line_block_line(self, match, lineno):
        """Return one line element of a line_block."""
        (indented, indent, line_offset, blank_finish
         ) = self.state_machine.get_first_known_indented(match.end(),
                                                         until_blank=True)
        text = '\n'.join(indented)
        text_nodes, messages = self.inline_text(text, lineno)
        line = nodes.line(text, '', *text_nodes)
        (line.source,
         line.line) = self.state_machine.get_source_and_line(lineno)
        if match.string.rstrip() != '|':  # not empty
            # Record how far the text is indented past the "|" marker.
            line.indent = len(match.group(1)) - 1
        return line, messages, blank_finish
1640
1641 def nest_line_block_lines(self, block) -> None:
1642 for index in range(1, len(block)):
1643 if getattr(block[index], 'indent', None) is None:
1644 block[index].indent = block[index - 1].indent
1645 self.nest_line_block_segment(block)
1646
    def nest_line_block_segment(self, block) -> None:
        """Recursively wrap deeper-indented runs of lines in nested blocks."""
        indents = [item.indent for item in block]
        least = min(indents)
        new_items = []
        new_block = nodes.line_block()
        for item in block:
            if item.indent > least:
                # Deeper than the base indent: collect into a nested block.
                new_block.append(item)
            else:
                if len(new_block):
                    # Flush the pending nested block before this base line.
                    self.nest_line_block_segment(new_block)
                    new_items.append(new_block)
                    new_block = nodes.line_block()
                new_items.append(item)
        if len(new_block):
            # Flush a trailing nested block.
            self.nest_line_block_segment(new_block)
            new_items.append(new_block)
        block[:] = new_items
1665
1666 def grid_table_top(self, match, context, next_state):
1667 """Top border of a full table."""
1668 return self.table_top(match, context, next_state,
1669 self.isolate_grid_table,
1670 tableparser.GridTableParser)
1671
1672 def simple_table_top(self, match, context, next_state):
1673 """Top border of a simple table."""
1674 return self.table_top(match, context, next_state,
1675 self.isolate_simple_table,
1676 tableparser.SimpleTableParser)
1677
1678 def table_top(self, match, context, next_state,
1679 isolate_function, parser_class):
1680 """Top border of a generic table."""
1681 nodelist, blank_finish = self.table(isolate_function, parser_class)
1682 self.parent += nodelist
1683 if not blank_finish:
1684 msg = self.reporter.warning(
1685 'Blank line required after table.',
1686 line=self.state_machine.abs_line_number()+1)
1687 self.parent += msg
1688 return [], next_state, []
1689
    def table(self, isolate_function, parser_class):
        """Parse a table."""
        block, messages, blank_finish = isolate_function()
        if block:
            try:
                parser = parser_class()
                tabledata = parser.parse(block)
                # Absolute line number of the table's first line.
                tableline = (self.state_machine.abs_line_number() - len(block)
                             + 1)
                table = self.build_table(tabledata, tableline)
                nodelist = [table] + messages
            except tableparser.TableMarkupError as err:
                # Report the malformed table with the parser's detail.
                nodelist = self.malformed_table(block, ' '.join(err.args),
                                                offset=err.offset) + messages
        else:
            nodelist = messages
        return nodelist, blank_finish
1707
    def isolate_grid_table(self):
        """Extract a grid table; return (block, messages, blank_finish)."""
        messages = []
        blank_finish = 1
        try:
            block = self.state_machine.get_text_block(flush_left=True)
        except statemachine.UnexpectedIndentationError as err:
            block, src, srcline = err.args
            messages.append(self.reporter.error('Unexpected indentation.',
                                                source=src, line=srcline))
            blank_finish = 0
        block.disconnect()
        # for East Asian chars:
        block.pad_double_width(self.double_width_pad_char)
        width = len(block[0].strip())
        for i in range(len(block)):
            block[i] = block[i].strip()
            if block[i][0] not in '+|':  # check left edge
                # Non-table line: back up and truncate the block here.
                blank_finish = 0
                self.state_machine.previous_line(len(block) - i)
                del block[i:]
                break
        if not self.grid_table_top_pat.match(block[-1]):  # find bottom
            blank_finish = 0
            # from second-last to third line of table:
            for i in range(len(block) - 2, 1, -1):
                if self.grid_table_top_pat.match(block[i]):
                    # Found a bottom border higher up; truncate below it.
                    self.state_machine.previous_line(len(block) - i + 1)
                    del block[i+1:]
                    break
            else:
                messages.extend(self.malformed_table(block))
                return [], messages, blank_finish
        for i in range(len(block)):  # check right edge
            if len(block[i]) != width or block[i][-1] not in '+|':
                messages.extend(self.malformed_table(block))
                return [], messages, blank_finish
        return block, messages, blank_finish
1745
    def isolate_simple_table(self):
        """Extract a simple table; return (block, messages, blank_finish)."""
        start = self.state_machine.line_offset
        lines = self.state_machine.input_lines
        limit = len(lines) - 1
        toplen = len(lines[start].strip())
        pattern_match = self.simple_table_border_pat.match
        found = 0
        found_at = None
        i = start + 1
        while i <= limit:
            line = lines[i]
            match = pattern_match(line)
            if match:
                if len(line.strip()) != toplen:
                    # Border width must match the top border.
                    self.state_machine.next_line(i - start)
                    messages = self.malformed_table(
                        lines[start:i+1], 'Bottom/header table border does '
                        'not match top border.')
                    return [], messages, i == limit or not lines[i+1].strip()
                found += 1
                found_at = i
                if found == 2 or i == limit or not lines[i+1].strip():
                    # Second border, or last border before end of input or
                    # a blank line: this is the table bottom.
                    end = i
                    break
            i += 1
        else:  # reached end of input_lines
            if found:
                extra = ' or no blank line after table bottom'
                self.state_machine.next_line(found_at - start)
                block = lines[start:found_at+1]
            else:
                extra = ''
                self.state_machine.next_line(i - start - 1)
                block = lines[start:]
            messages = self.malformed_table(
                block, 'No bottom table border found%s.' % extra)
            return [], messages, not extra
        self.state_machine.next_line(end - start)
        block = lines[start:end+1]
        # for East Asian chars:
        block.pad_double_width(self.double_width_pad_char)
        return block, [], end == limit or not lines[end+1].strip()
1788
1789 def malformed_table(self, block, detail='', offset=0):
1790 block.replace(self.double_width_pad_char, '')
1791 data = '\n'.join(block)
1792 message = 'Malformed table.'
1793 startline = self.state_machine.abs_line_number() - len(block) + 1
1794 if detail:
1795 message += '\n' + detail
1796 error = self.reporter.error(message, nodes.literal_block(data, data),
1797 line=startline+offset)
1798 return [error]
1799
1800 def build_table(self, tabledata, tableline, stub_columns=0, widths=None):
1801 colwidths, headrows, bodyrows = tabledata
1802 table = nodes.table()
1803 if widths == 'auto':
1804 table['classes'] += ['colwidths-auto']
1805 elif widths: # "grid" or list of integers
1806 table['classes'] += ['colwidths-given']
1807 tgroup = nodes.tgroup(cols=len(colwidths))
1808 table += tgroup
1809 for colwidth in colwidths:
1810 colspec = nodes.colspec(colwidth=colwidth)
1811 if stub_columns:
1812 colspec.attributes['stub'] = True
1813 stub_columns -= 1
1814 tgroup += colspec
1815 if headrows:
1816 thead = nodes.thead()
1817 tgroup += thead
1818 for row in headrows:
1819 thead += self.build_table_row(row, tableline)
1820 tbody = nodes.tbody()
1821 tgroup += tbody
1822 for row in bodyrows:
1823 tbody += self.build_table_row(row, tableline)
1824 return table
1825
1826 def build_table_row(self, rowdata, tableline):
1827 row = nodes.row()
1828 for cell in rowdata:
1829 if cell is None:
1830 continue
1831 morerows, morecols, offset, cellblock = cell
1832 attributes = {}
1833 if morerows:
1834 attributes['morerows'] = morerows
1835 if morecols:
1836 attributes['morecols'] = morecols
1837 entry = nodes.entry(**attributes)
1838 row += entry
1839 if ''.join(cellblock):
1840 self.nested_parse(cellblock, input_offset=tableline+offset,
1841 node=entry)
1842 return row
1843
    explicit = Struct()
    """Patterns and constants used for explicit markup recognition."""

    # Regexps applied to the text following the explicit-markup start
    # string (".. ").  Patterns are assembled with `Inliner` helper
    # fragments via %-interpolation.
    explicit.patterns = Struct(
        target=re.compile(r"""
                          (
                            _               # anonymous target
                          |               # *OR*
                            (?!_)           # no underscore at the beginning
                            (?P<quote>`?)   # optional open quote
                            (?![ `])        # first char. not space or
                                            # backquote
                            (?P<name>       # reference name
                              .+?
                            )
                            %(non_whitespace_escape_before)s
                            (?P=quote)      # close quote if open quote used
                          )
                          (?<!(?<!\x00):) # no unescaped colon at end
                          %(non_whitespace_escape_before)s
                          [ ]?            # optional space
                          :               # end of reference name
                          ([ ]+|$)        # followed by whitespace
                          """ % vars(Inliner), re.VERBOSE),
        reference=re.compile(r"""
                             (
                               (?P<simple>%(simplename)s)_
                             |               # *OR*
                               `               # open backquote
                               (?![ ])         # not space
                               (?P<phrase>.+?) # hyperlink phrase
                               %(non_whitespace_escape_before)s
                               `_              # close backquote,
                                               # reference mark
                             )
                             $               # end of string
                             """ % vars(Inliner), re.VERBOSE),
        substitution=re.compile(r"""
                                (
                                  (?![ ])       # first char. not space
                                  (?P<name>.+?) # substitution text
                                  %(non_whitespace_escape_before)s
                                  \|            # close delimiter
                                )
                                ([ ]+|$)        # followed by whitespace
                                """ % vars(Inliner),
                                re.VERBOSE),)
1891
    def footnote(self, match):
        """Parse a footnote (``.. [label] content``).

        Return a 2-tuple: ([footnote node], blank_finish flag).
        Handles manually numbered, auto-numbered ("#", "#label"), and
        auto-symbol ("*") footnote labels.
        """
        src, srcline = self.state_machine.get_source_and_line()
        (indented, indent, offset, blank_finish
         ) = self.state_machine.get_first_known_indented(match.end())
        label = match.group(1)
        name = normalize_name(label)
        footnote = nodes.footnote('\n'.join(indented))
        footnote.source = src
        footnote.line = srcline
        if name[0] == '#':  # auto-numbered
            name = name[1:]  # autonumber label
            footnote['auto'] = 1
            if name:
                footnote['names'].append(name)
            self.document.note_autofootnote(footnote)
        elif name == '*':  # auto-symbol
            name = ''
            footnote['auto'] = '*'
            self.document.note_symbol_footnote(footnote)
        else:  # manually numbered
            footnote += nodes.label('', label)
            footnote['names'].append(name)
            self.document.note_footnote(footnote)
        if name:
            # Named footnotes are explicit hyperlink targets.
            self.document.note_explicit_target(footnote, footnote)
        else:
            self.document.set_id(footnote, footnote)
        if indented:
            self.nested_parse(indented, input_offset=offset, node=footnote)
        else:
            footnote += self.reporter.warning('Footnote content expected.')
        return [footnote], blank_finish
1924
1925 def citation(self, match):
1926 src, srcline = self.state_machine.get_source_and_line()
1927 (indented, indent, offset, blank_finish
1928 ) = self.state_machine.get_first_known_indented(match.end())
1929 label = match.group(1)
1930 name = normalize_name(label)
1931 citation = nodes.citation('\n'.join(indented))
1932 citation.source = src
1933 citation.line = srcline
1934 citation += nodes.label('', label)
1935 citation['names'].append(name)
1936 self.document.note_citation(citation)
1937 self.document.note_explicit_target(citation, citation)
1938 if indented:
1939 self.nested_parse(indented, input_offset=offset, node=citation)
1940 else:
1941 citation += self.reporter.warning('Citation content expected.')
1942 return [citation], blank_finish
1943
    def hyperlink_target(self, match):
        """Parse a hyperlink target (``.. _name: link``).

        Return a 2-tuple: ([target node], blank_finish flag).
        Raise `MarkupError` if the target pattern never matches.
        """
        pattern = self.explicit.patterns.target
        lineno = self.state_machine.abs_line_number()
        (block, indent, offset, blank_finish
         ) = self.state_machine.get_first_known_indented(
             match.end(), until_blank=True, strip_indent=False)
        blocktext = match.string[:match.end()] + '\n'.join(block)
        block = [escape2null(line) for line in block]
        escaped = block[0]
        blockindex = 0
        # Accumulate lines until the target pattern matches: the target
        # name (and its terminating colon) may span multiple lines.
        while True:
            targetmatch = pattern.match(escaped)
            if targetmatch:
                break
            blockindex += 1
            try:
                escaped += block[blockindex]
            except IndexError:
                raise MarkupError('malformed hyperlink target.')
        del block[:blockindex]
        # Strip the matched name/colon prefix from the first remaining
        # line, leaving only the link block.
        block[0] = (block[0] + ' ')[targetmatch.end()-len(escaped)-1:].strip()
        target = self.make_target(block, blocktext, lineno,
                                  targetmatch.group('name'))
        return [target], blank_finish
1968
1969 def make_target(self, block, block_text, lineno, target_name):
1970 target_type, data = self.parse_target(block, block_text, lineno)
1971 if target_type == 'refname':
1972 target = nodes.target(block_text, '', refname=normalize_name(data))
1973 target.indirect_reference_name = data
1974 self.add_target(target_name, '', target, lineno)
1975 self.document.note_indirect_target(target)
1976 return target
1977 elif target_type == 'refuri':
1978 target = nodes.target(block_text, '')
1979 self.add_target(target_name, data, target, lineno)
1980 return target
1981 else:
1982 return data
1983
1984 def parse_target(self, block, block_text, lineno):
1985 """
1986 Determine the type of reference of a target.
1987
1988 :Return: A 2-tuple, one of:
1989
1990 - 'refname' and the indirect reference name
1991 - 'refuri' and the URI
1992 - 'malformed' and a system_message node
1993 """
1994 if block and block[-1].strip()[-1:] == '_': # possible indirect target
1995 reference = ' '.join(line.strip() for line in block)
1996 refname = self.is_reference(reference)
1997 if refname:
1998 return 'refname', refname
1999 ref_parts = split_escaped_whitespace(' '.join(block))
2000 reference = ' '.join(''.join(unescape(part).split())
2001 for part in ref_parts)
2002 return 'refuri', reference
2003
2004 def is_reference(self, reference):
2005 match = self.explicit.patterns.reference.match(
2006 whitespace_normalize_name(reference))
2007 if not match:
2008 return None
2009 return unescape(match.group('simple') or match.group('phrase'))
2010
2011 def add_target(self, targetname, refuri, target, lineno):
2012 target.line = lineno
2013 if targetname:
2014 name = normalize_name(unescape(targetname))
2015 target['names'].append(name)
2016 if refuri:
2017 uri = self.inliner.adjust_uri(refuri)
2018 if uri:
2019 target['refuri'] = uri
2020 else:
2021 raise ApplicationError('problem with URI: %r' % refuri)
2022 self.document.note_explicit_target(target, self.parent)
2023 else: # anonymous target
2024 if refuri:
2025 target['refuri'] = refuri
2026 target['anonymous'] = True
2027 self.document.note_anonymous_target(target)
2028
    def substitution_def(self, match):
        """Parse a substitution definition (``.. |name| directive:: ...``).

        Return a 2-tuple: (node list, blank_finish flag).  The node list
        holds the substitution_definition node on success, or a system
        message on failure; non-inline children are hoisted to the
        parent.
        """
        pattern = self.explicit.patterns.substitution
        src, srcline = self.state_machine.get_source_and_line()
        (block, indent, offset, blank_finish
         ) = self.state_machine.get_first_known_indented(match.end(),
                                                         strip_indent=False)
        blocktext = (match.string[:match.end()] + '\n'.join(block))
        block.disconnect()
        escaped = escape2null(block[0].rstrip())
        blockindex = 0
        # Accumulate lines until the "|name|" marker pattern matches:
        # the substitution text may span multiple lines.
        while True:
            subdefmatch = pattern.match(escaped)
            if subdefmatch:
                break
            blockindex += 1
            try:
                escaped = escaped + ' ' + escape2null(
                    block[blockindex].strip())
            except IndexError:
                raise MarkupError('malformed substitution definition.')
        del block[:blockindex]  # strip out the substitution marker
        start = subdefmatch.end()-len(escaped)-1
        block[0] = (block[0].strip() + ' ')[start:-1]
        if not block[0]:
            del block[0]
            offset += 1
        while block and not block[-1].strip():
            block.pop()
        subname = subdefmatch.group('name')
        substitution_node = nodes.substitution_definition(blocktext)
        substitution_node.source = src
        substitution_node.line = srcline
        if not block:
            msg = self.reporter.warning(
                'Substitution definition "%s" missing contents.' % subname,
                nodes.literal_block(blocktext, blocktext),
                source=src, line=srcline)
            return [msg], blank_finish
        block[0] = block[0].strip()
        substitution_node['names'].append(
            nodes.whitespace_normalize_name(subname))
        new_abs_offset, blank_finish = self.nested_list_parse(
            block, input_offset=offset, node=substitution_node,
            initial_state='SubstitutionDef', blank_finish=blank_finish)
        i = 0
        # Hoist non-inline children (e.g. system messages) out of the
        # substitution definition, up to the parent.
        for node in substitution_node[:]:
            if not (isinstance(node, nodes.Inline)
                    or isinstance(node, nodes.Text)):
                self.parent += substitution_node[i]
                del substitution_node[i]
            else:
                i += 1
        for node in substitution_node.findall(nodes.Element):
            if self.disallowed_inside_substitution_definitions(node):
                pformat = nodes.literal_block('', node.pformat().rstrip())
                msg = self.reporter.error(
                    'Substitution definition contains illegal element <%s>:'
                    % node.tagname,
                    pformat, nodes.literal_block(blocktext, blocktext),
                    source=src, line=srcline)
                return [msg], blank_finish
        if len(substitution_node) == 0:
            msg = self.reporter.warning(
                'Substitution definition "%s" empty or invalid.' % subname,
                nodes.literal_block(blocktext, blocktext),
                source=src, line=srcline)
            return [msg], blank_finish
        self.document.note_substitution_def(
            substitution_node, subname, self.parent)
        return [substitution_node], blank_finish
2099
2100 def disallowed_inside_substitution_definitions(self, node) -> bool:
2101 if (node['ids']
2102 or isinstance(node, nodes.reference) and node.get('anonymous')
2103 or isinstance(node, nodes.footnote_reference) and node.get('auto')): # noqa: E501
2104 return True
2105 else:
2106 return False
2107
2108 def directive(self, match, **option_presets):
2109 """Returns a 2-tuple: list of nodes, and a "blank finish" boolean."""
2110 type_name = match.group(1)
2111 directive_class, messages = directives.directive(
2112 type_name, self.memo.language, self.document)
2113 self.parent += messages
2114 if directive_class:
2115 return self.run_directive(
2116 directive_class, match, type_name, option_presets)
2117 else:
2118 return self.unknown_directive(type_name)
2119
    def run_directive(self, directive, match, type_name, option_presets):
        """
        Parse a directive then run its directive function.

        Parameters:

        - `directive`: The class implementing the directive.  Must be
          a subclass of `rst.Directive`.

        - `match`: A regular expression match object which matched the first
          line of the directive.

        - `type_name`: The directive name, as used in the source text.

        - `option_presets`: A dictionary of preset options, defaults for the
          directive options.  Currently, only an "alt" option is passed by
          substitution definitions (value: the substitution name), which may
          be used by an embedded image directive.

        Returns a 2-tuple: list of nodes, and a "blank finish" boolean.
        """
        # Legacy support: wrap plain directive functions in a class.
        if isinstance(directive, (FunctionType, MethodType)):
            from docutils.parsers.rst import convert_directive_function
            directive = convert_directive_function(directive)
        lineno = self.state_machine.abs_line_number()
        initial_line_offset = self.state_machine.line_offset
        (indented, indent, line_offset, blank_finish
         ) = self.state_machine.get_first_known_indented(match.end(),
                                                         strip_top=0)
        block_text = '\n'.join(self.state_machine.input_lines[
            initial_line_offset : self.state_machine.line_offset + 1])  # noqa: E203,E501
        try:
            arguments, options, content, content_offset = (
                self.parse_directive_block(indented, line_offset,
                                           directive, option_presets))
        except MarkupError as detail:
            error = self.reporter.error(
                'Error in "%s" directive:\n%s.' % (type_name,
                                                   ' '.join(detail.args)),
                nodes.literal_block(block_text, block_text), line=lineno)
            return [error], blank_finish
        directive_instance = directive(
            type_name, arguments, options, content, lineno,
            content_offset, block_text, self, self.state_machine)
        try:
            result = directive_instance.run()
        except docutils.parsers.rst.DirectiveError as error:
            # Directives report problems via DirectiveError; convert to a
            # system message with the offending text attached.
            msg_node = self.reporter.system_message(error.level, error.msg,
                                                    line=lineno)
            msg_node += nodes.literal_block(block_text, block_text)
            result = [msg_node]
        assert isinstance(result, list), \
            'Directive "%s" must return a list of nodes.' % type_name
        for i in range(len(result)):
            assert isinstance(result[i], nodes.Node), \
                ('Directive "%s" returned non-Node object (index %s): %r'
                 % (type_name, i, result[i]))
        return (result,
                blank_finish or self.state_machine.is_next_line_blank())
2179
    def parse_directive_block(self, indented, line_offset, directive,
                              option_presets):
        """Split a directive's indented block into its parts.

        Return (arguments, options, content, content offset).
        Raise `MarkupError` on invalid options or disallowed content.
        """
        option_spec = directive.option_spec
        has_content = directive.has_content
        if indented and not indented[0].strip():
            indented.trim_start()
            line_offset += 1
        while indented and not indented[-1].strip():
            indented.trim_end()
        if indented and (directive.required_arguments
                         or directive.optional_arguments
                         or option_spec):
            # The first blank line separates arguments/options from the
            # directive content.
            for i, line in enumerate(indented):
                if not line.strip():
                    break
            else:
                i += 1
            arg_block = indented[:i]
            content = indented[i+1:]
            content_offset = line_offset + i + 1
        else:
            content = indented
            content_offset = line_offset
            arg_block = []
        if option_spec:
            options, arg_block = self.parse_directive_options(
                option_presets, option_spec, arg_block)
        else:
            options = {}
        if arg_block and not (directive.required_arguments
                              or directive.optional_arguments):
            # No arguments expected: the "argument block" is really the
            # start of the content.
            content = arg_block + indented[i:]
            content_offset = line_offset
            arg_block = []
        while content and not content[0].strip():
            content.trim_start()
            content_offset += 1
        if directive.required_arguments or directive.optional_arguments:
            arguments = self.parse_directive_arguments(
                directive, arg_block)
        else:
            arguments = []
        if content and not has_content:
            raise MarkupError('no content permitted')
        return arguments, options, content, content_offset
2225
2226 def parse_directive_options(self, option_presets, option_spec, arg_block):
2227 options = option_presets.copy()
2228 for i, line in enumerate(arg_block):
2229 if re.match(Body.patterns['field_marker'], line):
2230 opt_block = arg_block[i:]
2231 arg_block = arg_block[:i]
2232 break
2233 else:
2234 opt_block = []
2235 if opt_block:
2236 success, data = self.parse_extension_options(option_spec,
2237 opt_block)
2238 if success: # data is a dict of options
2239 options.update(data)
2240 else: # data is an error string
2241 raise MarkupError(data)
2242 return options, arg_block
2243
2244 def parse_directive_arguments(self, directive, arg_block):
2245 required = directive.required_arguments
2246 optional = directive.optional_arguments
2247 arg_text = '\n'.join(arg_block)
2248 arguments = arg_text.split()
2249 if len(arguments) < required:
2250 raise MarkupError('%s argument(s) required, %s supplied'
2251 % (required, len(arguments)))
2252 elif len(arguments) > required + optional:
2253 if directive.final_argument_whitespace:
2254 arguments = arg_text.split(None, required + optional - 1)
2255 else:
2256 raise MarkupError(
2257 'maximum %s argument(s) allowed, %s supplied'
2258 % (required + optional, len(arguments)))
2259 return arguments
2260
2261 def parse_extension_options(self, option_spec, datalines):
2262 """
2263 Parse `datalines` for a field list containing extension options
2264 matching `option_spec`.
2265
2266 :Parameters:
2267 - `option_spec`: a mapping of option name to conversion
2268 function, which should raise an exception on bad input.
2269 - `datalines`: a list of input strings.
2270
2271 :Return:
2272 - Success value, 1 or 0.
2273 - An option dictionary on success, an error string on failure.
2274 """
2275 node = nodes.field_list()
2276 newline_offset, blank_finish = self.nested_list_parse(
2277 datalines, 0, node, initial_state='ExtensionOptions',
2278 blank_finish=True)
2279 if newline_offset != len(datalines): # incomplete parse of block
2280 return 0, 'invalid option block'
2281 try:
2282 options = utils.extract_extension_options(node, option_spec)
2283 except KeyError as detail:
2284 return 0, 'unknown option: "%s"' % detail.args[0]
2285 except (ValueError, TypeError) as detail:
2286 return 0, 'invalid option value: %s' % ' '.join(detail.args)
2287 except utils.ExtensionOptionError as detail:
2288 return 0, 'invalid option data: %s' % ' '.join(detail.args)
2289 if blank_finish:
2290 return 1, options
2291 else:
2292 return 0, 'option data incompletely parsed'
2293
2294 def unknown_directive(self, type_name):
2295 lineno = self.state_machine.abs_line_number()
2296 (indented, indent, offset, blank_finish
2297 ) = self.state_machine.get_first_known_indented(0, strip_indent=False)
2298 text = '\n'.join(indented)
2299 error = self.reporter.error('Unknown directive type "%s".' % type_name,
2300 nodes.literal_block(text, text),
2301 line=lineno)
2302 return [error], blank_finish
2303
2304 def comment(self, match):
2305 if self.state_machine.is_next_line_blank():
2306 first_comment_line = match.string[match.end():]
2307 if not first_comment_line.strip(): # empty comment
2308 return [nodes.comment()], True # "A tiny but practical wart."
2309 if first_comment_line.startswith('end of inclusion from "'):
2310 # cf. parsers.rst.directives.misc.Include
2311 self.document.include_log.pop()
2312 return [], True
2313 (indented, indent, offset, blank_finish
2314 ) = self.state_machine.get_first_known_indented(match.end())
2315 while indented and not indented[-1].strip():
2316 indented.trim_end()
2317 text = '\n'.join(indented)
2318 return [nodes.comment(text, text)], blank_finish
2319
    # (transition method, pattern) pairs, tried in order by
    # `explicit_construct()`.
    explicit.constructs = [
        (footnote,
         re.compile(r"""
                    \.\.[ ]+          # explicit markup start
                    \[
                    (                 # footnote label:
                        [0-9]+          # manually numbered footnote
                      |               # *OR*
                        \#              # anonymous auto-numbered footnote
                      |               # *OR*
                        \#%s            # auto-number ed?) footnote label
                      |               # *OR*
                        \*              # auto-symbol footnote
                    )
                    \]
                    ([ ]+|$)          # whitespace or end of line
                    """ % Inliner.simplename, re.VERBOSE)),
        (citation,
         re.compile(r"""
                    \.\.[ ]+          # explicit markup start
                    \[(%s)\]          # citation label
                    ([ ]+|$)          # whitespace or end of line
                    """ % Inliner.simplename, re.VERBOSE)),
        (hyperlink_target,
         re.compile(r"""
                    \.\.[ ]+          # explicit markup start
                    _                 # target indicator
                    (?![ ]|$)         # first char. not space or EOL
                    """, re.VERBOSE)),
        (substitution_def,
         re.compile(r"""
                    \.\.[ ]+          # explicit markup start
                    \|                # substitution indicator
                    (?![ ]|$)         # first char. not space or EOL
                    """, re.VERBOSE)),
        (directive,
         re.compile(r"""
                    \.\.[ ]+          # explicit markup start
                    (%s)              # directive name
                    [ ]?              # optional space
                    ::                # directive delimiter
                    ([ ]+|$)          # whitespace or end of line
                    """ % Inliner.simplename, re.VERBOSE))]
2363
2364 def explicit_markup(self, match, context, next_state):
2365 """Footnotes, hyperlink targets, directives, comments."""
2366 nodelist, blank_finish = self.explicit_construct(match)
2367 self.parent += nodelist
2368 self.explicit_list(blank_finish)
2369 return [], next_state, []
2370
2371 def explicit_construct(self, match):
2372 """Determine which explicit construct this is, parse & return it."""
2373 errors = []
2374 for method, pattern in self.explicit.constructs:
2375 expmatch = pattern.match(match.string)
2376 if expmatch:
2377 try:
2378 return method(self, expmatch)
2379 except MarkupError as error:
2380 lineno = self.state_machine.abs_line_number()
2381 message = ' '.join(error.args)
2382 errors.append(self.reporter.warning(message, line=lineno))
2383 break
2384 nodelist, blank_finish = self.comment(match)
2385 return nodelist + errors, blank_finish
2386
2387 def explicit_list(self, blank_finish) -> None:
2388 """
2389 Create a nested state machine for a series of explicit markup
2390 constructs (including anonymous hyperlink targets).
2391 """
2392 offset = self.state_machine.line_offset + 1 # next line
2393 newline_offset, blank_finish = self.nested_list_parse(
2394 self.state_machine.input_lines[offset:],
2395 input_offset=self.state_machine.abs_line_offset() + 1,
2396 node=self.parent, initial_state='Explicit',
2397 blank_finish=blank_finish,
2398 match_titles=self.state_machine.match_titles)
2399 self.goto_line(newline_offset)
2400 if not blank_finish:
2401 self.parent += self.unindent_warning('Explicit markup')
2402
2403 def anonymous(self, match, context, next_state):
2404 """Anonymous hyperlink targets."""
2405 nodelist, blank_finish = self.anonymous_target(match)
2406 self.parent += nodelist
2407 self.explicit_list(blank_finish)
2408 return [], next_state, []
2409
2410 def anonymous_target(self, match):
2411 lineno = self.state_machine.abs_line_number()
2412 (block, indent, offset, blank_finish
2413 ) = self.state_machine.get_first_known_indented(match.end(),
2414 until_blank=True)
2415 blocktext = match.string[:match.end()] + '\n'.join(block)
2416 block = [escape2null(line) for line in block]
2417 target = self.make_target(block, blocktext, lineno, '')
2418 return [target], blank_finish
2419
2420 def line(self, match, context, next_state):
2421 """Section title overline or transition marker."""
2422 if self.state_machine.match_titles:
2423 return [match.string], 'Line', []
2424 elif match.string.strip() == '::':
2425 raise statemachine.TransitionCorrection('text')
2426 elif len(match.string.strip()) < 4:
2427 msg = self.reporter.info(
2428 'Unexpected possible title overline or transition.\n'
2429 "Treating it as ordinary text because it's so short.",
2430 line=self.state_machine.abs_line_number())
2431 self.parent += msg
2432 raise statemachine.TransitionCorrection('text')
2433 else:
2434 blocktext = self.state_machine.line
2435 msg = self.reporter.severe(
2436 'Unexpected section title or transition.',
2437 nodes.literal_block(blocktext, blocktext),
2438 line=self.state_machine.abs_line_number())
2439 self.parent += msg
2440 return [], next_state, []
2441
    def text(self, match, context, next_state):
        """Titles, definition lists, paragraphs."""
        # Pass the line as context; the `Text` state classifies it by
        # looking at the following line.
        return [match.string], 'Text', []
2445
2446
class RFC2822Body(Body):

    """
    RFC2822 headers are only valid as the first constructs in documents.  As
    soon as anything else appears, the `Body` state should take over.
    """

    patterns = Body.patterns.copy()  # can't modify the original
    patterns['rfc2822'] = r'[!-9;-~]+:( +|$)'
    # All transitions lead back to `Body`; 'rfc2822' is tried just
    # before the catch-all 'text' transition.
    initial_transitions = [(name, 'Body')
                           for name in Body.initial_transitions]
    initial_transitions.insert(-1, ('rfc2822', 'Body'))  # just before 'text'

    def rfc2822(self, match, context, next_state):
        """RFC2822-style field list item."""
        fieldlist = nodes.field_list(classes=['rfc2822'])
        self.parent += fieldlist
        field, blank_finish = self.rfc2822_field(match)
        fieldlist += field
        offset = self.state_machine.line_offset + 1  # next line
        # Parse subsequent fields with the `RFC2822List` state.
        newline_offset, blank_finish = self.nested_list_parse(
            self.state_machine.input_lines[offset:],
            input_offset=self.state_machine.abs_line_offset() + 1,
            node=fieldlist, initial_state='RFC2822List',
            blank_finish=blank_finish)
        self.goto_line(newline_offset)
        if not blank_finish:
            self.parent += self.unindent_warning(
                'RFC2822-style field list')
        return [], next_state, []

    def rfc2822_field(self, match):
        """Parse one field; return (field node, blank_finish flag)."""
        name = match.string[:match.string.find(':')]
        (indented, indent, line_offset, blank_finish
         ) = self.state_machine.get_first_known_indented(match.end(),
                                                         until_blank=True)
        fieldnode = nodes.field()
        fieldnode += nodes.field_name(name, name)
        fieldbody = nodes.field_body('\n'.join(indented))
        fieldnode += fieldbody
        if indented:
            self.nested_parse(indented, input_offset=line_offset,
                              node=fieldbody)
        return fieldnode, blank_finish
2491
2492
class SpecializedBody(Body):

    """
    Superclass for second and subsequent compound element members.  Compound
    elements are lists and list-like constructs.

    All transition methods are disabled (redefined as `invalid_input`).
    Override individual methods in subclasses to re-enable.

    For example, once an initial bullet list item, say, is recognized, the
    `BulletList` subclass takes over, with a "bullet_list" node as its
    container.  Upon encountering the initial bullet list item, `Body.bullet`
    calls its ``self.nested_list_parse`` (`RSTState.nested_list_parse`), which
    starts up a nested parsing session with `BulletList` as the initial state.
    Only the ``bullet`` transition method is enabled in `BulletList`; as long
    as only bullet list items are encountered, they are parsed and inserted
    into the container.  The first construct which is *not* a bullet list item
    triggers the `invalid_input` method, which ends the nested parse and
    closes the container.  `BulletList` needs to recognize input that is
    invalid in the context of a bullet list, which means everything *other
    than* bullet list items, so it inherits the transition list created in
    `Body`.
    """

    def invalid_input(self, match=None, context=None, next_state=None):
        """Not a compound element member. Abort this state machine."""
        self.state_machine.previous_line()  # back up so parent SM can reassess
        raise EOFError

    # Every transition is disabled by default; subclasses re-enable only
    # the transition(s) valid for their compound element.
    indent = invalid_input
    bullet = invalid_input
    enumerator = invalid_input
    field_marker = invalid_input
    option_marker = invalid_input
    doctest = invalid_input
    line_block = invalid_input
    grid_table_top = invalid_input
    simple_table_top = invalid_input
    explicit_markup = invalid_input
    anonymous = invalid_input
    line = invalid_input
    text = invalid_input
2534 text = invalid_input
2535
2536
class BulletList(SpecializedBody):

    """Second and subsequent bullet_list list_items."""

    def bullet(self, match, context, next_state):
        """Bullet list item."""
        # A different bullet character means a new list starts here.
        if match.string[0] != self.parent['bullet']:
            self.invalid_input()
        item, blank_finish = self.list_item(match.end())
        self.parent += item
        self.blank_finish = blank_finish
        return [], next_state, []
2550
2551
class DefinitionList(SpecializedBody):

    """Second and subsequent definition_list_items."""

    def text(self, match, context, next_state):
        """Definition lists."""
        # The `Definition` state checks the next line for indentation.
        return [match.string], 'Definition', []
2559
2560
class EnumeratedList(SpecializedBody):

    """Second and subsequent enumerated_list list_items."""

    def enumerator(self, match, context, next_state):
        """Enumerated list item."""
        format, sequence, text, ordinal = self.parse_enumerator(
            match, self.parent['enumtype'])
        # The item must continue the current list: same format, and
        # either the auto-enumerator "#" or the same sequence type with
        # the next ordinal (and no mixing with auto-enumeration).
        if (format != self.format
            or (sequence != '#' and (sequence != self.parent['enumtype']
                                     or self.auto
                                     or ordinal != (self.lastordinal + 1)))
            or not self.is_enumerated_list_item(ordinal, sequence, format)):
            # different enumeration: new list
            self.invalid_input()
        if sequence == '#':
            self.auto = 1
        listitem, blank_finish = self.list_item(match.end())
        self.parent += listitem
        self.blank_finish = blank_finish
        self.lastordinal = ordinal
        return [], next_state, []
2583
2584
class FieldList(SpecializedBody):

    """Second and subsequent field_list fields."""

    def field_marker(self, match, context, next_state):
        """Field list field."""
        field_node, blank_finish = self.field(match)
        self.parent += field_node
        self.blank_finish = blank_finish
        return [], next_state, []
2595
2596
class OptionList(SpecializedBody):

    """Second and subsequent option_list option_list_items."""

    def option_marker(self, match, context, next_state):
        """Option list item."""
        try:
            item, blank_finish = self.option_list_item(match)
        except MarkupError:
            # Not a valid option-list item after all: end this list.
            # (`invalid_input()` raises EOFError.)
            self.invalid_input()
        self.parent += item
        self.blank_finish = blank_finish
        return [], next_state, []
2610
2611
class RFC2822List(SpecializedBody, RFC2822Body):

    """Second and subsequent RFC2822-style field_list fields."""

    # Reuse RFC2822Body's transition patterns; everything else is
    # disabled via SpecializedBody.
    patterns = RFC2822Body.patterns
    initial_transitions = RFC2822Body.initial_transitions

    def rfc2822(self, match, context, next_state):
        """RFC2822-style field list item."""
        field, blank_finish = self.rfc2822_field(match)
        self.parent += field
        self.blank_finish = blank_finish
        # Remain in this state for subsequent fields.
        return [], 'RFC2822List', []

    blank = SpecializedBody.invalid_input
2627
2628
class ExtensionOptions(FieldList):

    """
    Parse field_list fields for extension options.

    No nested parsing is done (including inline markup parsing).
    """

    def parse_field_body(self, indented, offset, node) -> None:
        """Override `Body.parse_field_body` for simpler parsing."""
        # Split the body into blank-line-separated chunks; each chunk
        # becomes one plain paragraph (no inline markup processing).
        paragraph_lines = []
        for line in list(indented) + ['']:  # trailing '' flushes last chunk
            if line.strip():
                paragraph_lines.append(line)
            elif paragraph_lines:
                text = '\n'.join(paragraph_lines)
                node += nodes.paragraph(text, text)
                paragraph_lines = []
2647
2648
class LineBlock(SpecializedBody):

    """Second and subsequent lines of a line_block."""

    blank = SpecializedBody.invalid_input

    def line_block(self, match, context, next_state):
        """New line of line block."""
        lineno = self.state_machine.abs_line_number()
        line_node, messages, blank_finish = self.line_block_line(match,
                                                                 lineno)
        self.parent += line_node
        # System messages go next to the line_block, not inside it.
        self.parent.parent += messages
        self.blank_finish = blank_finish
        return [], next_state, []
2663
2664
class Explicit(SpecializedBody):

    """Second and subsequent explicit markup construct."""

    # A blank line ends the series of explicit markup constructs:
    blank = SpecializedBody.invalid_input

    def explicit_markup(self, match, context, next_state):
        """Footnotes, hyperlink targets, directives, comments."""
        node_list, finished = self.explicit_construct(match)
        self.parent += node_list
        self.blank_finish = finished
        return [], next_state, []

    def anonymous(self, match, context, next_state):
        """Anonymous hyperlink targets."""
        node_list, finished = self.anonymous_target(match)
        self.parent += node_list
        self.blank_finish = finished
        return [], next_state, []
2684
2685
class SubstitutionDef(Body):

    """
    Parser for the contents of a substitution_definition element.
    """

    patterns = {
        # An embedded directive: "name::" at the start of the content.
        'embedded_directive': re.compile(r'(%s)::( +|$)'
                                         % Inliner.simplename),
        'text': r''}
    initial_transitions = ['embedded_directive', 'text']

    def embedded_directive(self, match, context, next_state):
        """Parse a directive embedded in the substitution definition."""
        # Pass the substitution name as the directive's `alt` value.
        nodelist, blank_finish = self.directive(match,
                                                alt=self.parent['names'][0])
        self.parent += nodelist
        if not self.state_machine.at_eof():
            self.blank_finish = blank_finish
        # The substitution definition content is consumed; end this
        # state machine.
        raise EOFError

    def text(self, match, context, next_state):
        # Not an embedded directive: record blank-line status and
        # terminate.  (Further handling of the text is left to the
        # caller.)
        if not self.state_machine.at_eof():
            self.blank_finish = self.state_machine.is_next_line_blank()
        raise EOFError
2710
2711
class Text(RSTState):

    """
    Classifier of second line of a text block.

    Could be a paragraph, a definition list item, or a title.
    """

    patterns = {'underline': Body.patterns['line'],
                'text': r''}
    initial_transitions = [('underline', 'Body'), ('text', 'Body')]

    def blank(self, match, context, next_state):
        """End of paragraph."""
        # NOTE: self.paragraph returns [node, system_message(s)], literalnext
        paragraph, literalnext = self.paragraph(
            context, self.state_machine.abs_line_number() - 1)
        self.parent += paragraph
        if literalnext:
            # The paragraph ended with "::"; parse the following block
            # as a literal block.
            self.parent += self.literal_block()
        return [], 'Body', []

    def eof(self, context):
        """End of input: flush any pending paragraph text."""
        if context:
            self.blank(None, context, None)
        return []

    def indent(self, match, context, next_state):
        """Definition list item."""
        dl = nodes.definition_list()
        # the definition list starts on the line before the indent:
        lineno = self.state_machine.abs_line_number() - 1
        dl.source, dl.line = self.state_machine.get_source_and_line(lineno)
        dl_item, blank_finish = self.definition_list_item(context)
        dl += dl_item
        self.parent += dl
        offset = self.state_machine.line_offset + 1   # next line
        # Parse subsequent definition-list items with a nested state
        # machine starting in the `DefinitionList` state.
        newline_offset, blank_finish = self.nested_list_parse(
            self.state_machine.input_lines[offset:],
            input_offset=self.state_machine.abs_line_offset() + 1,
            node=dl, initial_state='DefinitionList',
            blank_finish=blank_finish, blank_finish_state='Definition')
        self.goto_line(newline_offset)
        if not blank_finish:
            self.parent += self.unindent_warning('Definition list')
        return [], 'Body', []

    def underline(self, match, context, next_state):
        """Section title."""
        lineno = self.state_machine.abs_line_number()
        title = context[0].rstrip()
        underline = match.string.rstrip()
        source = title + '\n' + underline
        messages = []
        if column_width(title) > len(underline):
            if len(underline) < 4:
                if self.state_machine.match_titles:
                    msg = self.reporter.info(
                        'Possible title underline, too short for the title.\n'
                        "Treating it as ordinary text because it's so short.",
                        line=lineno)
                    self.parent += msg
                # Underline too short to count: reclassify as plain text.
                raise statemachine.TransitionCorrection('text')
            else:
                blocktext = context[0] + '\n' + self.state_machine.line
                msg = self.reporter.warning(
                    'Title underline too short.',
                    nodes.literal_block(blocktext, blocktext),
                    line=lineno)
                messages.append(msg)
        if not self.state_machine.match_titles:
            # Section titles are not allowed in this context.
            blocktext = context[0] + '\n' + self.state_machine.line
            # We need get_source_and_line() here to report correctly
            src, srcline = self.state_machine.get_source_and_line()
            # TODO: why is abs_line_number() == srcline+1
            # if the error is in a table (try with test_tables.py)?
            # print("get_source_and_line", srcline)
            # print("abs_line_number", self.state_machine.abs_line_number())
            msg = self.reporter.severe(
                'Unexpected section title.',
                nodes.literal_block(blocktext, blocktext),
                source=src, line=srcline)
            self.parent += messages
            self.parent += msg
            return [], next_state, []
        # The underline character identifies the section style:
        style = underline[0]
        context[:] = []
        self.section(title, source, style, lineno - 1, messages)
        return [], next_state, []

    def text(self, match, context, next_state):
        """Paragraph."""
        startline = self.state_machine.abs_line_number() - 1
        msg = None
        try:
            block = self.state_machine.get_text_block(flush_left=True)
        except statemachine.UnexpectedIndentationError as err:
            block, src, srcline = err.args
            msg = self.reporter.error('Unexpected indentation.',
                                      source=src, line=srcline)
        lines = context + list(block)
        paragraph, literalnext = self.paragraph(lines, startline)
        self.parent += paragraph
        self.parent += msg
        if literalnext:
            try:
                # Advance one line (presumably past the blank line after
                # the "::") before parsing the literal block; verify.
                self.state_machine.next_line()
            except EOFError:
                pass
            self.parent += self.literal_block()
        return [], next_state, []

    def literal_block(self):
        """Return a list of nodes."""
        (indented, indent, offset, blank_finish
         ) = self.state_machine.get_indented()
        # Strip trailing blank lines from the indented block:
        while indented and not indented[-1].strip():
            indented.trim_end()
        if not indented:
            # No indented block found; try a quoted (unindented) one.
            return self.quoted_literal_block()
        data = '\n'.join(indented)
        literal_block = nodes.literal_block(data, data)
        (literal_block.source,
         literal_block.line) = self.state_machine.get_source_and_line(offset+1)
        nodelist = [literal_block]
        if not blank_finish:
            nodelist.append(self.unindent_warning('Literal block'))
        return nodelist

    def quoted_literal_block(self):
        """Parse a quoted (unindented) literal block; return its nodes."""
        abs_line_offset = self.state_machine.abs_line_offset()
        offset = self.state_machine.line_offset
        parent_node = nodes.Element()
        # Nested parse using only the special-purpose QuotedLiteralBlock
        # state; titles are disabled inside the block.
        new_abs_offset = self.nested_parse(
            self.state_machine.input_lines[offset:],
            input_offset=abs_line_offset, node=parent_node, match_titles=False,
            state_machine_kwargs={'state_classes': (QuotedLiteralBlock,),
                                  'initial_state': 'QuotedLiteralBlock'})
        self.goto_line(new_abs_offset)
        return parent_node.children

    def definition_list_item(self, termline):
        """Parse one definition_list_item; return (node, blank_finish)."""
        # the parser is already on the second (indented) line:
        dd_lineno = self.state_machine.abs_line_number()
        dt_lineno = dd_lineno - 1
        (indented, indent, line_offset, blank_finish
         ) = self.state_machine.get_indented()
        dl_item = nodes.definition_list_item(
            '\n'.join(termline + list(indented)))
        (dl_item.source,
         dl_item.line) = self.state_machine.get_source_and_line(dt_lineno)
        dt_nodes, messages = self.term(termline, dt_lineno)
        dl_item += dt_nodes
        dd = nodes.definition('', *messages)
        dd.source, dd.line = self.state_machine.get_source_and_line(dd_lineno)
        dl_item += dd
        if termline[0][-2:] == '::':
            # A term ending in "::" was probably meant to start a
            # literal block; warn but parse as a definition anyway.
            dd += self.reporter.info(
                'Blank line missing before literal block (after the "::")? '
                'Interpreted as a definition list item.',
                line=dd_lineno)
        # TODO: drop a definition if it is an empty comment to allow
        # definition list items with several terms?
        # https://sourceforge.net/p/docutils/feature-requests/60/
        self.nested_parse(indented, input_offset=line_offset, node=dd)
        return dl_item, blank_finish

    # " : " (space-colon-space) separates the term from classifiers:
    classifier_delimiter = re.compile(' +: +')

    def term(self, lines, lineno):
        """Return a definition_list's term and optional classifiers."""
        assert len(lines) == 1
        text_nodes, messages = self.inline_text(lines[0], lineno)
        dt = nodes.term(lines[0])
        dt.source, dt.line = self.state_machine.get_source_and_line(lineno)
        node_list = [dt]
        for i in range(len(text_nodes)):
            node = text_nodes[i]
            if isinstance(node, nodes.Text):
                parts = self.classifier_delimiter.split(node)
                if len(parts) == 1:
                    # No classifier delimiter in this chunk of text.
                    node_list[-1] += node
                else:
                    # Text before the first delimiter belongs to the
                    # current term/classifier; the rest are classifiers.
                    text = parts[0].rstrip()
                    textnode = nodes.Text(text)
                    node_list[-1] += textnode
                    node_list += [nodes.classifier(unescape(part, True), part)
                                  for part in parts[1:]]
            else:
                # Inline-markup nodes attach to the current element.
                node_list[-1] += node
        return node_list, messages
2903
2904
class SpecializedText(Text):

    """
    Superclass for second and subsequent lines of Text-variants.

    All transition methods are disabled. Override individual methods in
    subclasses to re-enable.
    """

    def invalid_input(self, match=None, context=None, next_state=None):
        """Not a compound element member. Abort this state machine."""
        raise EOFError

    def eof(self, context):
        """Incomplete construct."""
        return []

    # Disable every transition; subclasses re-enable them selectively.
    blank = invalid_input
    indent = invalid_input
    underline = invalid_input
    text = invalid_input
2926
2927
class Definition(SpecializedText):

    """Second line of potential definition_list_item."""

    def eof(self, context):
        """Not a definition."""
        # Back up two lines so the parent state machine can reassess.
        self.state_machine.previous_line(2)
        return []

    def indent(self, match, context, next_state):
        """Definition list item."""
        item, finished = self.definition_list_item(context)
        self.parent += item
        self.blank_finish = finished
        return [], 'DefinitionList', []
2943
2944
class Line(SpecializedText):

    """
    Second line of over- & underlined section title or transition marker.
    """

    eofcheck = 1  # @@@ ???
    """Set to 0 while parsing sections, so that we don't catch the EOF."""

    def eof(self, context):
        """Transition marker at end of section or document."""
        marker = context[0].strip()
        if self.memo.section_bubble_up_kludge:
            self.memo.section_bubble_up_kludge = False
        elif len(marker) < 4:
            # Too short for a transition marker: reclassify as text.
            self.state_correction(context)
        if self.eofcheck:  # ignore EOFError with sections
            src, srcline = self.state_machine.get_source_and_line()
            # lineno = self.state_machine.abs_line_number() - 1
            transition = nodes.transition(rawsource=context[0])
            transition.source = src
            transition.line = srcline - 1
            # transition.line = lineno
            self.parent += transition
        self.eofcheck = 1
        return []

    def blank(self, match, context, next_state):
        """Transition marker."""
        src, srcline = self.state_machine.get_source_and_line()
        marker = context[0].strip()
        if len(marker) < 4:
            # Too short for a transition marker: reclassify as text.
            self.state_correction(context)
        transition = nodes.transition(rawsource=marker)
        transition.source = src
        transition.line = srcline - 1
        self.parent += transition
        return [], 'Body', []

    def text(self, match, context, next_state):
        """Potential over- & underlined title."""
        lineno = self.state_machine.abs_line_number() - 1
        overline = context[0]
        title = match.string
        underline = ''
        try:
            underline = self.state_machine.next_line()
        except EOFError:
            # Input ended right after the title line: incomplete title.
            blocktext = overline + '\n' + title
            if len(overline.rstrip()) < 4:
                self.short_overline(context, blocktext, lineno, 2)
            else:
                msg = self.reporter.severe(
                    'Incomplete section title.',
                    nodes.literal_block(blocktext, blocktext),
                    line=lineno)
                self.parent += msg
                return [], 'Body', []
        source = '%s\n%s\n%s' % (overline, title, underline)
        overline = overline.rstrip()
        underline = underline.rstrip()
        if not self.transitions['underline'][0].match(underline):
            # Third line is not a valid underline.
            blocktext = overline + '\n' + title + '\n' + underline
            if len(overline.rstrip()) < 4:
                self.short_overline(context, blocktext, lineno, 2)
            else:
                msg = self.reporter.severe(
                    'Missing matching underline for section title overline.',
                    nodes.literal_block(source, source),
                    line=lineno)
                self.parent += msg
                return [], 'Body', []
        elif overline != underline:
            # Overline and underline must be identical.
            blocktext = overline + '\n' + title + '\n' + underline
            if len(overline.rstrip()) < 4:
                self.short_overline(context, blocktext, lineno, 2)
            else:
                msg = self.reporter.severe(
                    'Title overline & underline mismatch.',
                    nodes.literal_block(source, source),
                    line=lineno)
                self.parent += msg
                return [], 'Body', []
        title = title.rstrip()
        messages = []
        if column_width(title) > len(overline):
            blocktext = overline + '\n' + title + '\n' + underline
            if len(overline.rstrip()) < 4:
                self.short_overline(context, blocktext, lineno, 2)
            else:
                msg = self.reporter.warning(
                    'Title overline too short.',
                    nodes.literal_block(source, source),
                    line=lineno)
                messages.append(msg)
        # Overline + underline characters together identify the style:
        style = (overline[0], underline[0])
        self.eofcheck = 0  # @@@ not sure this is correct
        self.section(title.lstrip(), source, style, lineno + 1, messages)
        self.eofcheck = 1
        return [], 'Body', []

    indent = text  # indented title

    def underline(self, match, context, next_state):
        # Two consecutive "line" lines: neither a title nor a transition.
        overline = context[0]
        blocktext = overline + '\n' + self.state_machine.line
        lineno = self.state_machine.abs_line_number() - 1
        if len(overline.rstrip()) < 4:
            self.short_overline(context, blocktext, lineno, 1)
        msg = self.reporter.error(
            'Invalid section title or transition marker.',
            nodes.literal_block(blocktext, blocktext),
            line=lineno)
        self.parent += msg
        return [], 'Body', []

    def short_overline(self, context, blocktext, lineno, lines=1) -> None:
        """Report a too-short overline and reclassify it as text."""
        msg = self.reporter.info(
            'Possible incomplete section title.\nTreating the overline as '
            "ordinary text because it's so short.",
            line=lineno)
        self.parent += msg
        self.state_correction(context, lines)

    def state_correction(self, context, lines=1):
        """Back up `lines` lines and restart in Body's "text" transition."""
        self.state_machine.previous_line(lines)
        context[:] = []
        raise statemachine.StateCorrection('Body', 'text')
3073
3074
class QuotedLiteralBlock(RSTState):

    """
    Nested parse handler for quoted (unindented) literal blocks.

    Special-purpose. Not for inclusion in `state_classes`.
    """

    patterns = {'initial_quoted': r'(%(nonalphanum7bit)s)' % Body.pats,
                'text': r''}
    initial_transitions = ('initial_quoted', 'text')

    def __init__(self, state_machine, debug=False) -> None:
        RSTState.__init__(self, state_machine, debug)
        # System messages collected during the block, flushed in `eof`:
        self.messages = []
        # Absolute line number of the block's first line; set by
        # `initial_quoted`:
        self.initial_lineno = None

    def blank(self, match, context, next_state):
        # A blank line ends the block (if any lines were collected).
        if context:
            raise EOFError
        else:
            return context, next_state, []

    def eof(self, context):
        """Build the literal_block node from the collected lines."""
        if context:
            src, srcline = self.state_machine.get_source_and_line(
                self.initial_lineno)
            text = '\n'.join(context)
            literal_block = nodes.literal_block(text, text)
            literal_block.source = src
            literal_block.line = srcline
            self.parent += literal_block
        else:
            self.parent += self.reporter.warning(
                'Literal block expected; none found.',
                line=self.state_machine.abs_line_number()
                )  # src not available, statemachine.input_lines is empty
            self.state_machine.previous_line()
        self.parent += self.messages
        return []

    def indent(self, match, context, next_state):
        # Indentation is not allowed inside a quoted literal block.
        assert context, ('QuotedLiteralBlock.indent: context should not '
                         'be empty!')
        self.messages.append(
            self.reporter.error('Unexpected indentation.',
                                line=self.state_machine.abs_line_number()))
        self.state_machine.previous_line()
        raise EOFError

    def initial_quoted(self, match, context, next_state):
        """Match arbitrary quote character on the first line only."""
        self.remove_transition('initial_quoted')
        quote = match.string[0]
        pattern = re.compile(re.escape(quote))
        # New transition matches consistent quotes only:
        self.add_transition('quoted',
                            (pattern, self.quoted, self.__class__.__name__))
        self.initial_lineno = self.state_machine.abs_line_number()
        return [match.string], next_state, []

    def quoted(self, match, context, next_state):
        """Match consistent quotes on subsequent lines."""
        context.append(match.string)
        return context, next_state, []

    def text(self, match, context, next_state):
        # A line starting with a different character ends the block.
        if context:
            self.messages.append(
                self.reporter.error('Inconsistent literal block quoting.',
                                    line=self.state_machine.abs_line_number()))
            self.state_machine.previous_line()
        raise EOFError
3148
3149
# `QuotedLiteralBlock` is intentionally absent: it is a special-purpose
# state, not for general use (see its docstring).
state_classes = (Body, BulletList, DefinitionList, EnumeratedList, FieldList,
                 OptionList, LineBlock, ExtensionOptions, Explicit, Text,
                 Definition, Line, SubstitutionDef, RFC2822Body, RFC2822List)
"""Standard set of State classes used to start `RSTStateMachine`."""