osm-labo/venv/lib/python3.12/site-packages/docutils/parsers/docutils_xml.py
2025-08-31 12:29:41 +02:00

194 lines
6.4 KiB
Python

#! /usr/bin/env python3
# :Copyright: © 2024 Günter Milde.
# :License: Released under the terms of the `2-Clause BSD license`_, in short:
#
# Copying and distribution of this file, with or without modification,
# are permitted in any medium without royalty provided the copyright
# notice and this notice are preserved.
# This file is offered as-is, without any warranty.
#
# .. _2-Clause BSD license: https://opensource.org/licenses/BSD-2-Clause
#
# Revision: $Revision: 10136 $
# Date: $Date: 2025-05-20 17:48:27 +0200 (Di, 20. Mai 2025) $
"""A Docutils-XML parser.
Provisional:
The API is not fixed yet.
Defined objects may be renamed or changed
in any Docutils release without prior notice.
"""
from __future__ import annotations
__docformat__ = 'reStructuredText'
import re
import xml.etree.ElementTree as ET
from docutils import frontend, nodes, parsers, utils
class Parser(parsers.Parser):
    """Docutils parser component for "Docutils XML" sources.

    The heavy lifting is delegated to the module-level function
    `parse_element()`; this class only plugs it into the parser API.
    """

    supported = ('xml', 'docutils-xml')
    """Aliases this parser supports."""

    config_section = 'xml parser'
    config_section_dependencies = ('parsers',)

    # Setting defaults that differ from the generic parser defaults.
    settings_default_overrides = {
        'doctitle_xform': False,
        'validate': True,
    }

    def parse(self, inputstring, document) -> None:
        """
        Parse `inputstring` and populate `document`, a "document tree".

        :inputstring: XML source text.
        :document: the document tree to fill.

        The result of `parse_element()` is appended to `document`
        unless it is a `nodes.document` instance itself.

        Provisional.
        """
        self.setup_parse(inputstring, document)
        root_node = parse_element(inputstring, document)
        is_whole_document = isinstance(root_node, nodes.document)
        if not is_whole_document:
            # A document fragment: attach it to the document tree.
            document.append(root_node)
        self.finish_parse()
class Unknown(nodes.Special, nodes.Inline, nodes.Element):
    """Node class for elements the XML parser does not know."""

    # Content model: any number of `nodes.Element` or `nodes.Text` children.
    content_model = (
        ((nodes.Element, nodes.Text), '*'),  # no restrictions
    )
def parse_element(inputstring, document=None):
    """
    Parse `inputstring` as "Docutils XML", return `nodes.Element` instance.

    :inputstring: XML source.
    :document: `nodes.document` instance (default: a new dummy instance).
               Provides settings and reporter.
               Populated and returned, if the inputstring's root element
               is <document>.

    The source is fed to an `XMLPullParser` one line at a time, so that
    every element can be tagged with the number of the line on which its
    start tag was reported (stored in the temporary "source line"
    attribute, which `element2node()` reads back).

    An `ET.ParseError` is re-raised if `document` is None. Otherwise it is
    reported as an error via `document.reporter`, parsing stops, and the
    elements seen so far are converted.

    Caution:
      The function does not detect invalid XML.

      To check the validity of the returned node,
      you may use its `validate()` method::

        node = parse_element('<tip><hint>text</hint></tip>')
        node.validate()

    Provisional.
    """
    root = None
    pull_parser = ET.XMLPullParser(events=('start',))
    source_lines = inputstring.splitlines(keepends=True)
    for lineno, chunk in enumerate(source_lines, start=1):
        try:
            pull_parser.feed(chunk)
            for _event, started in pull_parser.read_events():
                if root is None:
                    # The first element to start is the root element.
                    root = started
                started.attrib['source line'] = str(lineno)
        except ET.ParseError as error:
            if document is None:
                raise
            document.reporter.error(f'XML parse error: {error}.',
                                    source=document.settings._source,
                                    line=error.position[0])
            break
    return element2node(root, document)
def element2node(element, document=None, unindent=True):
    """
    Convert an `etree` element and its children to Docutils doctree nodes.

    :element:  `xml.etree` element, or None if no element was found.
    :document: see `parse_element()`
    :unindent: Remove formatting indentation of follow-up text lines?
               Cf. `append_text()`.

               TODO: do we need an "unindent" configuration setting?

    Return a `docutils.nodes.Element` instance:

    * `document` itself, if the element's tag is "document",
    * an instance of the `docutils.nodes` class named like the tag,
    * an `Unknown` instance (and a reporter warning), if the tag does not
      name a `nodes.Element` subclass,
    * a paragraph with a "problematic" child, if `element` is None.

    Internal.
    """
    if document is None:
        document = utils.new_document('xml input',
                                      frontend.get_default_settings(Parser))
        # Must be an assignment: a bare `==` comparison has no effect.
        document.source = 'xml input'
    if element is None:
        problem = nodes.problematic('', 'No XML element found.')
        return nodes.paragraph('', '', problem)

    # Get the corresponding `nodes.Element` instance:
    try:
        nodeclass = getattr(nodes, element.tag)
        if not issubclass(nodeclass, nodes.Element):
            nodeclass = Unknown
    except (AttributeError, TypeError):
        # AttributeError: `nodes` has no attribute named like the tag.
        # TypeError: the attribute exists but is not a class
        # (e.g. a function or an imported module).
        nodeclass = Unknown
    if nodeclass is nodes.document:
        node = document
        document.source = document.source or document.settings._source
    else:
        node = nodeclass()
    # "source line" is a temporary attribute added by `parse_element()`;
    # elements from other sources may lack it.
    source_line = element.get('source line')
    if source_line is not None:
        node.line = int(source_line)
    if isinstance(node, Unknown):
        node.tagname = element.tag
        document.reporter.warning(
            f'Unknown element type <{element.tag}>.',
            base_node=node)

    # Attributes: convert and add to `node.attributes`.
    for key, value in element.items():
        if key.startswith('{') or key == 'source line':
            continue  # skip duplicate attributes with namespace URL
        try:
            node.attributes[key] = nodes.ATTRIBUTE_VALIDATORS[key](value)
        except (ValueError, KeyError):
            # No validator for `key` or the value failed conversion:
            # store the value regardless.
            if key in node.list_attributes:
                value = value.split()
            node.attributes[key] = value  # node becomes invalid!

    # Bookkeeping (register some elements/attributes in document-wide lists)
    if isinstance(node, nodes.decoration):
        document.decoration = node
    elif isinstance(node, nodes.substitution_definition):
        document.note_substitution_def(node, ' '.join(node['names']),
                                       document)
    if node['ids']:  # register, check for duplicates
        document.set_id(node)
        # TODO: anything missing?

    # Append content:
    # update "unindent" flag: change line indentation?
    unindent = unindent and not isinstance(
        node, (nodes.FixedTextElement, nodes.literal, Unknown))
    # (leading) text
    append_text(node, element.text, unindent)
    # children and their trailing text
    for child in element:
        node.append(element2node(child, document, unindent))
        # Text after a child node
        append_text(node, child.tail, unindent)

    return node
def append_text(node, text, unindent) -> None:
    """Format `text`, wrap it in a `nodes.Text` and append it to `node`.

    :node: the element that receives the text.
    :text: character data (may be None or empty: nothing is appended).
    :unindent: if true, drop the run of spaces that follows each newline.

    For a `nodes.TextElement`, the text is appended as is.
    For any other node, surrounding whitespace is stripped and
    whitespace-only text is skipped.
    """
    if text:
        if unindent:
            text = re.sub('\n +', '\n', text)
        if isinstance(node, nodes.TextElement):
            node.append(nodes.Text(text))
            return
        # no TextElement: ignore formatting whitespace
        # but append other text (node becomes invalid!)
        stripped = text.strip()
        if stripped:
            node.append(nodes.Text(stripped))