1# $Id$
2# Author: David Goodger <goodger@python.org>
3# Copyright: This module has been placed in the public domain.
4
5"""
This is the ``docutils.parsers.rst`` package. It exports a single class,
`Parser`, the reStructuredText parser.
8
9
10Usage
11=====
12
131. Create a parser::
14
15 parser = docutils.parsers.rst.Parser()
16
17 Several optional arguments may be passed to modify the parser's behavior.
18 Please see `Customizing the Parser`_ below for details.
19
202. Gather input (a multi-line string), by reading a file or the standard
21 input::
22
23 input = sys.stdin.read()
24
253. Create a new empty `docutils.nodes.document` tree::
26
27 document = docutils.utils.new_document(source, settings)
28
29 See `docutils.utils.new_document()` for parameter details.
30
314. Run the parser, populating the document tree::
32
33 parser.parse(input, document)
34
35
36Parser Overview
37===============
38
39The reStructuredText parser is implemented as a state machine, examining its
40input one line at a time. To understand how the parser works, please first
41become familiar with the `docutils.statemachine` module, then see the
42`states` module.
43
44
45Customizing the Parser
46----------------------
47
48Anything that isn't already customizable is that way simply because that type
49of customizability hasn't been implemented yet. Patches welcome!
50
51When instantiating an object of the `Parser` class, two parameters may be
52passed: ``rfc2822`` and ``inliner``. Pass ``rfc2822=True`` to enable an
53initial RFC-2822 style header block, parsed as a "field_list" element (with
54"class" attribute set to "rfc2822"). Currently this is the only body-level
55element which is customizable without subclassing. (Tip: subclass `Parser`
56and change its "state_classes" and "initial_state" attributes to refer to new
57classes. Contact the author if you need more details.)
58
59The ``inliner`` parameter takes an instance of `states.Inliner` or a subclass.
60It handles inline markup recognition. A common extension is the addition of
61further implicit hyperlinks, like "RFC 2822". This can be done by subclassing
62`states.Inliner`, adding a new method for the implicit markup, and adding a
63``(pattern, method)`` pair to the "implicit_dispatch" attribute of the
64subclass. See `states.Inliner.implicit_inline()` for details. Explicit
65inline markup can be customized in a `states.Inliner` subclass via the
66``patterns.initial`` and ``dispatch`` attributes (and new methods as
67appropriate).
68"""
69
70from __future__ import annotations
71
72__docformat__ = 'reStructuredText'
73
74import docutils.parsers
75import docutils.statemachine
76from docutils.parsers.rst import roles, states
77from docutils import frontend, nodes
78from docutils.transforms import universal
79
80
class Parser(docutils.parsers.Parser):

    """
    The reStructuredText parser.

    Instances are configured with an initial state name (``'Body'`` or
    ``'RFC2822Body'``), the state classes from the `states` module and an
    optional inliner object; `parse()` feeds the input to a
    `states.RSTStateMachine`.
    """

    supported = ('rst', 'restructuredtext', 'rest', 'restx', 'rtxt', 'rstx')
    """Aliases this parser supports."""

    # Runtime settings contributed by this parser, appended to the
    # settings of the generic parser base class.
    settings_spec = docutils.parsers.Parser.settings_spec + (
        'reStructuredText Parser Options',
        None,
        (('Recognize and link to standalone PEP references (like "PEP 258").',
          ['--pep-references'],
          {'action': 'store_true', 'validator': frontend.validate_boolean}),
         ('Base URL for PEP references '
          '(default "https://peps.python.org/").',
          ['--pep-base-url'],
          {'metavar': '<URL>', 'default': 'https://peps.python.org/',
           'validator': frontend.validate_url_trailing_slash}),
         ('Template for PEP file part of URL. (default "pep-%04d")',
          ['--pep-file-url-template'],
          {'metavar': '<URL>', 'default': 'pep-%04d'}),
         ('Recognize and link to standalone RFC references (like "RFC 822").',
          ['--rfc-references'],
          {'action': 'store_true', 'validator': frontend.validate_boolean}),
         ('Base URL for RFC references '
          '(default "https://tools.ietf.org/html/").',
          ['--rfc-base-url'],
          {'metavar': '<URL>', 'default': 'https://tools.ietf.org/html/',
           'validator': frontend.validate_url_trailing_slash}),
         ('Set number of spaces for tab expansion (default 8).',
          ['--tab-width'],
          {'metavar': '<width>', 'type': 'int', 'default': 8,
           'validator': frontend.validate_nonnegative_int}),
         ('Remove spaces before footnote references.',
          ['--trim-footnote-reference-space'],
          {'action': 'store_true', 'validator': frontend.validate_boolean}),
         ('Leave spaces before footnote references.',
          ['--leave-footnote-reference-space'],
          {'action': 'store_false', 'dest': 'trim_footnote_reference_space'}),
         ('Token name set for parsing code with Pygments: one of '
          '"long", "short", or "none" (no parsing). Default is "long".',
          ['--syntax-highlight'],
          {'choices': ['long', 'short', 'none'],
           'default': 'long', 'metavar': '<format>'}),
         ('Change straight quotation marks to typographic form: '
          'one of "yes", "no", "alt[ernative]" (default "no").',
          ['--smart-quotes'],
          {'default': False, 'metavar': '<yes/no/alt>',
           'validator': frontend.validate_ternary}),
         ('Characters to use as "smart quotes" for <language>. ',
          ['--smartquotes-locales'],
          {'metavar': '<language:quotes[,language:quotes,...]>',
           'action': 'append',
           'validator': frontend.validate_smartquotes_locales}),
         ('Inline markup recognized at word boundaries only '
          '(adjacent to punctuation or whitespace). '
          'Force character-level inline markup recognition with '
          '"\\ " (backslash + space). Default.',
          ['--word-level-inline-markup'],
          {'action': 'store_false', 'dest': 'character_level_inline_markup'}),
         ('Inline markup recognized anywhere, regardless of surrounding '
          'characters. Backslash-escapes must be used to avoid unwanted '
          'markup recognition. Useful for East Asian languages. '
          'Experimental.',
          ['--character-level-inline-markup'],
          {'action': 'store_true', 'default': False,
           'dest': 'character_level_inline_markup'}),
         )
        )

    config_section = 'restructuredtext parser'
    config_section_dependencies = ('parsers',)

    def __init__(self, rfc2822=False, inliner=None) -> None:
        """
        Configure the parser.

        A true `rfc2822` selects the ``'RFC2822Body'`` initial state,
        otherwise parsing starts in the ``'Body'`` state.  `inliner` is
        stored unchanged and handed to the state machine by `parse()`.
        """
        self.initial_state = 'RFC2822Body' if rfc2822 else 'Body'
        self.state_classes = states.state_classes
        self.inliner = inliner

    def get_transforms(self):
        """Return the inherited transforms plus `universal.SmartQuotes`."""
        inherited = super().get_transforms()
        return inherited + [universal.SmartQuotes]

    def parse(self, inputstring, document) -> None:
        """
        Parse `inputstring` and populate `document`, a document tree.

        If any input line is longer than the ``line_length_limit``
        setting, the state machine is not run; instead a single error
        naming the first offending line (1-based) is appended to the
        document.
        """
        self.setup_parse(inputstring, document)
        settings = self.document.settings
        # provide fallbacks in case the document has only generic settings
        settings.setdefault('tab_width', 8)
        settings.setdefault('syntax_highlight', 'long')
        self.statemachine = states.RSTStateMachine(
            state_classes=self.state_classes,
            initial_state=self.initial_state,
            debug=document.reporter.debug_flag)
        inputlines = docutils.statemachine.string2lines(
            inputstring, tab_width=document.settings.tab_width,
            convert_whitespace=True)
        # Number of the first over-long line, or None if every line fits.
        # The limit is looked up lazily, i.e. only when there are lines.
        overlong = next(
            (number for number, text in enumerate(inputlines, start=1)
             if len(text) > settings.line_length_limit),
            None)
        if overlong is None:
            self.statemachine.run(inputlines, document, inliner=self.inliner)
        else:
            error = self.document.reporter.error(
                'Line %d exceeds the line-length-limit.' % overlong)
            self.document.append(error)
        # restore the "default" default role after parsing a document
        roles._roles.pop('', None)
        self.finish_parse()
190
191
class DirectiveError(Exception):

    """
    Store a message and a system message level.

    To be thrown from inside directive code.

    Do not instantiate directly -- use `Directive.directive_error()`
    instead!

    Attributes:

    - ``level``: the system message level passed to the constructor.
    - ``msg``: the message passed to the constructor.
    """

    def __init__(self, level, message) -> None:
        """Set error `message` and `level`."""
        # Forward both values to the base class so that `args` mirrors the
        # constructor signature: copying/pickling re-creates the exception
        # by calling the class with `args`, which fails when `args` is empty.
        super().__init__(level, message)
        self.level = level
        self.msg = message

    def __str__(self) -> str:
        """Return the message only, so tracebacks and logs show it."""
        return str(self.msg)
208
209
class Directive:

    """
    Base class for reStructuredText directives.

    Subclasses describe the directive syntax with the class attributes
    below; they are interpreted by the directive parser (which runs the
    directive class):

    - `required_arguments`: The number of required arguments (default:
      0).

    - `optional_arguments`: The number of optional arguments (default:
      0).

    - `final_argument_whitespace`: A boolean, indicating if the final
      argument may contain whitespace (default: False).

    - `option_spec`: A dictionary, mapping known option names to
      conversion functions such as `int` or `float` (default: None, no
      options).  Several conversion functions are defined in the
      directives/__init__.py module.

      An option conversion function takes a single parameter, the
      option argument (a string or ``None``), and validates it and/or
      converts it to the appropriate form.  It may raise `ValueError`
      and `TypeError` exceptions.

    - `has_content`: A boolean; True if content is allowed (default:
      False).  Client code must handle the case where content is
      required but not supplied (an empty content list will be
      supplied).

    Arguments are normally single whitespace-separated words.  The
    final argument may contain whitespace and/or newlines if
    `final_argument_whitespace` is True.

    If the form of the arguments is more complex, specify only one
    argument (either required or optional) and set
    `final_argument_whitespace` to True; the client code must do any
    context-sensitive parsing.

    To run a directive, the directive class is instantiated and its
    `run()` method is executed.  Instantiation sets these instance
    variables:

    - ``name`` is the directive type or name (string).

    - ``arguments`` is the list of positional arguments (strings).

    - ``options`` is a dictionary mapping option names (strings) to
      values (type depends on option conversion functions; see
      `option_spec` above).

    - ``content`` is a list of strings, the directive content line by line.

    - ``lineno`` is the absolute line number of the first line
      of the directive.

    - ``content_offset`` is the line offset of the first line
      of the content from the beginning of the current input.
      Used when initiating a nested parse.

    - ``block_text`` is a string containing the entire directive.

    - ``state`` is the state which called the directive function.

    - ``state_machine`` is the state machine which controls the state
      which called the directive function.

    - ``reporter`` is the state machine's `reporter` instance.

    Directive functions return a list of nodes which will be inserted
    into the document tree at the point where the directive was
    encountered.  This can be an empty list if there is nothing to
    insert.

    For ordinary directives, the list must contain body elements or
    structural elements.  Some directives are intended specifically
    for substitution definitions, and must return a list of `Text`
    nodes and/or inline elements (suitable for inline insertion, in
    place of the substitution reference).  Such directives must verify
    substitution definition context, typically using code like this::

        if not isinstance(state, states.SubstitutionDef):
            error = self.reporter.error(
                'Invalid context: the "%s" directive can only be used '
                'within a substitution definition.' % (name),
                nodes.literal_block(block_text, block_text), line=lineno)
            return [error]
    """

    # There is a "Creating reStructuredText Directives" how-to at
    # <https://docutils.sourceforge.io/docs/howto/rst-directives.html>.  If you
    # update this docstring, please update the how-to as well.

    required_arguments = 0
    """Number of required directive arguments."""

    optional_arguments = 0
    """Number of optional arguments after the required arguments."""

    final_argument_whitespace = False
    """May the final argument contain whitespace?"""

    option_spec = None
    """Mapping of option names to validator functions."""

    has_content = False
    """May the directive have content?"""

    def __init__(self, name, arguments, options, content, lineno,
                 content_offset, block_text, state, state_machine) -> None:
        """Store the parsed directive data and its parsing context."""
        # What was written in the source document.
        self.name = name
        self.arguments = arguments
        self.options = options
        self.content = content
        self.block_text = block_text
        # Where it was found.
        self.lineno = lineno
        self.content_offset = content_offset
        # Who is running the directive.
        self.state = state
        self.state_machine = state_machine
        self.reporter = state_machine.reporter

    def run(self):
        """Run the directive; every subclass has to override this."""
        raise NotImplementedError('Must override run() in subclass.')

    # Directive errors:

    def directive_error(self, level, message):
        """
        Return a DirectiveError suitable for being thrown as an exception.

        Use ``raise self.directive_error(level, message)`` inside a
        directive implementation to return one single system message
        at level `level`, which automatically gets the directive block
        and the line number added.

        The wrapper methods `debug`, `info`, `warning`, `error`, and
        `severe` are preferable, e.g. ``self.error(message)`` generates
        an ERROR-level directive error.
        """
        return DirectiveError(level, message)

    def debug(self, message):
        """Return a level-0 (DEBUG) directive error for `message`."""
        return self.directive_error(0, message)

    def info(self, message):
        """Return a level-1 (INFO) directive error for `message`."""
        return self.directive_error(1, message)

    def warning(self, message):
        """Return a level-2 (WARNING) directive error for `message`."""
        return self.directive_error(2, message)

    def error(self, message):
        """Return a level-3 (ERROR) directive error for `message`."""
        return self.directive_error(3, message)

    def severe(self, message):
        """Return a level-4 (SEVERE) directive error for `message`."""
        return self.directive_error(4, message)

    # Convenience methods:

    def assert_has_content(self):
        """
        Throw an ERROR-level DirectiveError if the directive doesn't
        have contents.
        """
        if self.content:
            return
        raise self.error('Content block expected for the "%s" directive; '
                         'none found.' % self.name)

    def add_name(self, node) -> None:
        """Append self.options['name'] to node['names'] if it exists.

        The "name" option is removed from `self.options`, normalized, and
        `node` is registered with the document as explicit target.
        Nothing happens if there is no "name" option.
        """
        if 'name' not in self.options:
            return
        normalized = nodes.fully_normalize_name(self.options.pop('name'))
        # Drop a "name" entry of the node itself before recording the name.
        if 'name' in node:
            del node['name']
        node['names'].append(normalized)
        self.state.document.note_explicit_target(node, node)
390
391
def convert_directive_function(directive_fn):
    """
    Define & return a directive class generated from `directive_fn`.

    `directive_fn` uses the old-style, functional interface: its syntax
    is read from the optional function attributes ``options``,
    ``content`` and ``arguments``, and it is called with the nine values
    a `Directive` instance stores.
    """
    # Old-style syntax description, with the defaults used when the
    # function does not carry the respective attribute.
    legacy_options = getattr(directive_fn, 'options', None)
    legacy_content = getattr(directive_fn, 'content', False)
    legacy_arguments = getattr(directive_fn, 'arguments', (0, 0, False))

    class FunctionalDirective(Directive):

        option_spec = legacy_options
        has_content = legacy_content
        _argument_spec = legacy_arguments
        # (required, optional, whitespace allowed in final argument)
        (required_arguments,
         optional_arguments,
         final_argument_whitespace) = _argument_spec

        def run(self):
            """Call the wrapped function and return its result."""
            call_args = (
                self.name, self.arguments, self.options, self.content,
                self.lineno, self.content_offset, self.block_text,
                self.state, self.state_machine)
            return directive_fn(*call_args)

    # Return new-style directive.
    return FunctionalDirective