194 lines
		
	
	
	
		
			6.4 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			194 lines
		
	
	
	
		
			6.4 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| #! /usr/bin/env python3
 | |
| # :Copyright: © 2024 Günter Milde.
 | |
| # :License: Released under the terms of the `2-Clause BSD license`_, in short:
 | |
| #
 | |
| #    Copying and distribution of this file, with or without modification,
 | |
| #    are permitted in any medium without royalty provided the copyright
 | |
| #    notice and this notice are preserved.
 | |
| #    This file is offered as-is, without any warranty.
 | |
| #
 | |
| # .. _2-Clause BSD license: https://opensource.org/licenses/BSD-2-Clause
 | |
| #
 | |
| # Revision: $Revision: 10136 $
 | |
| # Date: $Date: 2025-05-20 17:48:27 +0200 (Di, 20. Mai 2025) $
 | |
| 
 | |
| """A Docutils-XML parser.
 | |
| 
 | |
|    Provisional:
 | |
|      The API is not fixed yet.
 | |
|      Defined objects may be renamed or changed
 | |
|      in any Docutils release without prior notice.
 | |
| """
 | |
| 
 | |
| from __future__ import annotations
 | |
| 
 | |
| __docformat__ = 'reStructuredText'
 | |
| 
 | |
| import re
 | |
| import xml.etree.ElementTree as ET
 | |
| 
 | |
| from docutils import frontend, nodes, parsers, utils
 | |
| 
 | |
| 
 | |
class Parser(parsers.Parser):
    """Parser for sources written in the Docutils-XML format."""

    supported = ('xml', 'docutils-xml')
    """Aliases this parser supports."""

    config_section = 'xml parser'
    config_section_dependencies = ('parsers',)
    settings_default_overrides = {
        'doctitle_xform': False,
        'validate': True,
    }

    def parse(self, inputstring, document) -> None:
        """
        Convert the XML source `inputstring` and store it in `document`.

        The conversion is delegated to `parse_element()`.  If the result
        is not a `nodes.document` instance, it is appended to `document`
        as a child.

        Provisional.
        """
        self.setup_parse(inputstring, document)

        result = parse_element(inputstring, document)
        is_document = isinstance(result, nodes.document)
        if not is_document:
            # Root element was not <document>: attach the node as child.
            document.append(result)

        self.finish_parse()
 | |
| 
 | |
| 
 | |
class Unknown(nodes.Special, nodes.Inline, nodes.Element):
    """Node for an XML element whose tag the XML parser does not know."""

    # No restrictions: any number of `Element` or `Text` children.
    content_model = (
        ((nodes.Element, nodes.Text), '*'),
    )
 | |
| 
 | |
| 
 | |
def parse_element(inputstring, document=None):
    """
    Parse `inputstring` as "Docutils XML", return `nodes.Element` instance.

    The source is fed to an incremental XML parser line by line, so that
    each XML element can be tagged with the number of the input line
    that was being fed when its start tag was reported.
    The resulting element tree is converted with `element2node()`.

    :inputstring: XML source.
    :document: `nodes.document` instance (default: a new dummy instance).
               Provides settings and reporter.
               Populated and returned, if the inputstring's root element
               is <document>.

    XML parse errors (`xml.etree.ElementTree.ParseError`) are re-raised
    if no `document` is given.  Otherwise, they are reported as error via
    the document's reporter, parsing stops, and the elements
    found so far are converted.

    Caution:
      The function does not detect invalid XML.

      To check the validity of the returned node,
      you may use its `validate()` method::

        node = parse_element('<tip><hint>text</hint></tip>')
        node.validate()

    Provisional.
    """
    pull_parser = ET.XMLPullParser(events=('start',))
    root = None
    source_lines = inputstring.splitlines(keepends=True)
    for lineno, source_line in enumerate(source_lines, start=1):
        try:
            pull_parser.feed(source_line)
            for _event, started in pull_parser.read_events():
                if root is None:
                    # the first element reported is the root element
                    root = started
                # remember where the element started
                # (evaluated and removed again in `element2node()`)
                started.attrib['source line'] = str(lineno)
        except ET.ParseError as error:
            if document is None:
                raise
            document.reporter.error(f'XML parse error: {error}.',
                                    source=document.settings._source,
                                    line=error.position[0])
            break
    return element2node(root, document)
 | |
| 
 | |
| 
 | |
def element2node(element, document=None, unindent=True):
    """
    Convert an `etree` element and its children to Docutils doctree nodes.

    :element:  `xml.etree` element or None.
               Elements created by `parse_element()` carry the
               auxiliary attribute "source line" which is used to set
               the node's `line` attribute.
               Elements without this marker are accepted, too
               (the node's `line` attribute is left unchanged).
    :document: see `parse_element()`
    :unindent: Remove formatting indentation of follow-up text lines?
               Cf. `append_text()`.
               TODO: do we need an "unindent" configuration setting?

    Return a `docutils.nodes.Element` instance:

    * `document`, if the element's tag is "document",
    * a paragraph with a "problematic" message, if `element` is None,
    * an `Unknown` instance (and report a warning), if the tag does not
      name a `nodes.Element` subclass in `docutils.nodes`,
    * an instance of the matching node class otherwise.

    Attribute values that cannot be converted by the corresponding
    function in `nodes.ATTRIBUTE_VALIDATORS` are stored unconverted
    (the node becomes invalid).

    Internal.
    """
    if document is None:
        document = utils.new_document('xml input',
                                      frontend.get_default_settings(Parser))
        document.source = 'xml input'
    if element is None:
        problem = nodes.problematic('', 'No XML element found.')
        return nodes.paragraph('', '', problem)

    # Get the corresponding `nodes.Element` subclass.
    # `docutils.nodes` also contains objects that are not classes
    # (imported modules, functions, ...): test for "class" first,
    # because `issubclass()` raises a TypeError for other objects.
    nodeclass = getattr(nodes, element.tag, None)
    if not (isinstance(nodeclass, type)
            and issubclass(nodeclass, nodes.Element)):
        nodeclass = Unknown
    if nodeclass is nodes.document:
        # populate the existing document instead of creating a new one
        node = document
        document.source = document.source or document.settings._source
    else:
        node = nodeclass()

    # The line number is stored by `parse_element()`; it is missing
    # in elements from other sources.
    source_line = element.get('source line')
    if source_line is not None:
        node.line = int(source_line)
    if isinstance(node, Unknown):
        node.tagname = element.tag
        document.reporter.warning(
            f'Unknown element type <{element.tag}>.',
            base_node=node)

    # Attributes: convert and add to `node.attributes`.
    for key, value in element.items():
        if key.startswith('{') or key == 'source line':
            # skip duplicate attributes with namespace URL
            # and the auxiliary line number marker
            continue
        try:
            node.attributes[key] = nodes.ATTRIBUTE_VALIDATORS[key](value)
        except (ValueError, KeyError):
            # unknown attribute or invalid value: store unconverted
            if key in node.list_attributes:
                value = value.split()
            node.attributes[key] = value  # node becomes invalid!

    # Bookkeeping (register some elements/attributes in document-wide lists)
    if isinstance(node, nodes.decoration):
        document.decoration = node
    elif isinstance(node, nodes.substitution_definition):
        document.note_substitution_def(node, ' '.join(node['names']), document)
    if node['ids']:  # register, check for duplicates
        document.set_id(node)
    # TODO: anything missing?

    # Append content:
    # update "unindent" flag: change line indentation?
    # (The flag is passed on, so it stays off for all descendants of
    # fixed-text, literal, and unknown elements.)
    unindent = unindent and not isinstance(
                   node, (nodes.FixedTextElement, nodes.literal, Unknown))
    # (leading) text
    append_text(node, element.text, unindent)
    # children and their tailing text
    for child in element:
        node.append(element2node(child, document, unindent))
        # Text after a child node
        append_text(node, child.tail, unindent)

    return node
 | |
| 
 | |
| 
 | |
def append_text(node, text, unindent) -> None:
    """
    Format `text`, wrap it in a `nodes.Text` instance, append to `node`.

    :node:     parent node that receives the text.
    :text:     string or None; nothing is appended if it is empty.
    :unindent: if true, remove spaces at the start of follow-up lines.

    If `node` is a `nodes.TextElement`, the text is appended as-is
    (after optional unindenting).  For other nodes, surrounding
    whitespace is stripped and whitespace-only text is skipped.
    """
    if not text:
        return
    if unindent:
        text = re.sub('\n +', '\n', text)
    if not isinstance(node, nodes.TextElement):
        # no TextElement: ignore formatting whitespace
        # but append other text (node becomes invalid!)
        text = text.strip()
        if not text:
            return
    node.append(nodes.Text(text))
 | 
