193 lines
4.3 KiB
Python
193 lines
4.3 KiB
Python
import re
|
|
import string
|
|
from typing import Any, Dict, Tuple, Union
|
|
|
|
from .util import escape_url
|
|
|
|
# Matches an even-length (possibly empty) run of backslashes that is not
# itself preceded by a backslash, so whatever follows it is unescaped.
PREVENT_BACKSLASH = r"(?<!\\)(?:\\\\)*"

# Character class accepting any single character of ``string.punctuation``.
PUNCTUATION = rf"[{re.escape(string.punctuation)}]"

# Up to 500 label items: a character other than backslash or square
# bracket, or a backslash followed by one character.
LINK_LABEL = r"(?:[^\\\[\]]|\\.){0,500}"

# Optional spaces/tabs, at most one newline, more spaces/tabs, then "<".
LINK_BRACKET_START = re.compile(r"[ \t]*\n?[ \t]*<")

# "<...>" whose captured content has no "<", ">", newline, backslash or NUL.
LINK_BRACKET_RE = re.compile(r"<([^<>\n\\\x00]*)>")

# Leading blank space, then a run of non-whitespace (captured) that is
# followed by one whitespace character or the end of the input.
LINK_HREF_BLOCK_RE = re.compile(r"[ \t]*\n?[ \t]*([^\s]+)(?:\s|$)")

# Leading blank space, then the shortest run without space/tab/newline
# (captured) that is followed by whitespace or an unescaped ")".
LINK_HREF_INLINE_RE = re.compile(
    r"[ \t]*\n?[ \t]*([^ \t\n]*?)"
    rf"(?:[ \t\n]|(?:{PREVENT_BACKSLASH}\)))"
)

# Mandatory whitespace followed by a quoted title (captured with quotes).
LINK_TITLE_RE = re.compile(
    r"[ \t\n]+("
    rf'"(?:\\{PUNCTUATION}|[^"\x00])*"|'  # "title"
    rf"'(?:\\{PUNCTUATION}|[^'\x00])*'"  # 'title'
    r")"
)

# Optional whitespace followed by a closing parenthesis.
PAREN_END_RE = re.compile(r"\s*\)")
|
|
|
|
# An ASCII letter followed by letters, digits or hyphens.
HTML_TAGNAME = r"[A-Za-z][A-Za-z0-9-]*"

# Zero or more attributes, each introduced by whitespace and optionally
# followed by "= value".
HTML_ATTRIBUTES = (
    r"(?:\s+[A-Za-z_:][A-Za-z0-9_.:-]*"  # whitespace + attribute name
    r"(?:\s*=\s*(?:"  # optional value specification
    r'[^ !"\'=<>`]+'  # unquoted value
    r"|\'[^\']*?\'"  # single-quoted value
    r'|"[^\"]*?"'  # double-quoted value
    r"))?)*"
)
|
|
|
|
# HTML element names, in alphabetical order, one row per initial letter.
# NOTE(review): presumably the tags treated as block-level HTML by the
# block parser; the consumers are outside this file.
BLOCK_TAGS = (
    "address", "article", "aside",
    "base", "basefont", "blockquote", "body",
    "caption", "center", "col", "colgroup",
    "dd", "details", "dialog", "dir", "div", "dl", "dt",
    "fieldset", "figcaption", "figure", "footer", "form", "frame", "frameset",
    "h1", "h2", "h3", "h4", "h5", "h6", "head", "header", "hr", "html",
    "iframe",
    "legend", "li", "link",
    "main", "menu", "menuitem", "meta",
    "nav", "noframes",
    "ol", "optgroup", "option",
    "p", "param",
    "section", "source", "summary",
    "table", "tbody", "td", "tfoot", "th", "thead", "title", "tr", "track",
    "ul",
)

# NOTE(review): presumably the elements whose raw content is kept verbatim;
# confirm against the block parser.
PRE_TAGS = ("pre", "script", "style", "textarea")
|
|
|
|
# Link label content followed by the closing "]".
_INLINE_LINK_LABEL_RE = re.compile(rf"{LINK_LABEL}\]")

# A "[" or "]" that is not backslash-escaped.  Note that the match itself
# may start with pairs of backslashes consumed by PREVENT_BACKSLASH.
_INLINE_SQUARE_BRACKET_RE = re.compile(rf"{PREVENT_BACKSLASH}[\[\]]")

# A backslash followed by one punctuation character (captured in group 1).
_ESCAPE_CHAR_RE = re.compile(rf"\\({PUNCTUATION})")
|
|
|
|
|
|
def unescape_char(text: str) -> str:
    """Return ``text`` with each backslash-escaped punctuation character
    replaced by the bare punctuation character.
    """
    # Group 1 of the pattern is the punctuation character after the backslash.
    return _ESCAPE_CHAR_RE.sub(r"\g<1>", text)
|
|
|
|
|
|
def parse_link_text(src: str, pos: int) -> Union[Tuple[str, int], Tuple[None, None]]:
    """Find the ``]`` that balances an already-open ``[`` and return the text
    in between.

    Nesting depth starts at one, i.e. the caller is assumed to have consumed
    an opening ``[`` immediately before ``pos``.  Backslash-escaped brackets
    are skipped by the search pattern and never affect the depth.

    :param src: text being scanned.
    :param pos: index at which the link text begins.
    :return: ``(text, end)`` where ``text`` is the raw source between the
        brackets and ``end`` is the index just past the balancing ``]``, or
        ``(None, None)`` when no balancing bracket is found.
    """
    depth = 1
    start_pos = pos

    while pos < len(src):
        m = _INLINE_SQUARE_BRACKET_RE.search(src, pos)
        if m is None:
            break

        pos = m.end()
        # The match can begin with pairs of escaped backslashes (e.g. the
        # three characters ``\\]``), so the bracket is always the LAST matched
        # character.  Comparing the whole match with "]" would misclassify
        # such a closing bracket as an opening one.
        if src[pos - 1] == "]":
            depth -= 1
            if depth == 0:
                # pos - 1 is the index of the balancing "]" itself.
                return src[start_pos : pos - 1], pos
        else:
            depth += 1

    return None, None
|
|
|
|
|
|
def parse_link_label(src: str, start_pos: int) -> Union[Tuple[str, int], Tuple[None, None]]:
    """Match a link label starting at ``start_pos`` up to its closing ``]``.

    :return: ``(label, end)`` where ``label`` excludes the closing bracket and
        ``end`` is the index just past it, or ``(None, None)`` on no match.
    """
    matched = _INLINE_LINK_LABEL_RE.match(src, start_pos)
    if matched is None:
        return None, None

    after_bracket = matched.end()
    # Everything matched except the final "]".
    return src[matched.start() : after_bracket - 1], after_bracket
|
|
|
|
|
|
def parse_link_href(src: str, start_pos: int, block: bool = False) -> Union[Tuple[str, int], Tuple[None, None]]:
    """Read a link destination beginning at ``start_pos``.

    A destination wrapped in ``<...>`` is tried first; otherwise the bare form
    is matched with the block or inline pattern depending on ``block``.

    :param src: text being parsed.
    :param start_pos: index where the destination (with optional leading
        blank space) begins.
    :param block: use ``LINK_HREF_BLOCK_RE`` instead of ``LINK_HREF_INLINE_RE``.
    :return: ``(href, end)`` or ``(None, None)`` when nothing matches.  For a
        bare destination ``end`` points AT the terminating character so the
        caller can still inspect it; for a bracketed one it is just past ``>``.
    """
    opener = LINK_BRACKET_START.match(src, start_pos)
    if opener is not None:
        # Step back one so the bracket pattern starts on the "<" itself.
        bracketed = LINK_BRACKET_RE.match(src, opener.end() - 1)
        if bracketed is None:
            return None, None
        return bracketed.group(1), bracketed.end()

    pattern = LINK_HREF_BLOCK_RE if block else LINK_HREF_INLINE_RE
    found = pattern.match(src, start_pos)
    if found is None:
        return None, None

    href = found.group(1)
    stop = found.end()

    # The block pattern may end on "$" without consuming a terminator; in that
    # case the last matched character is the last character of the href and
    # there is nothing to step back over.
    if block and src[stop - 1] == href[-1]:
        return href, stop
    return href, stop - 1
|
|
|
|
|
|
def parse_link_title(src: str, start_pos: int, max_pos: int) -> Union[Tuple[str, int], Tuple[None, None]]:
    """Read a quoted link title located between ``start_pos`` and ``max_pos``.

    :return: ``(title, end)`` with the surrounding quotes removed and
        backslash escapes resolved, or ``(None, None)`` when no title matches.
    """
    found = LINK_TITLE_RE.match(src, start_pos, max_pos)
    if found is None:
        return None, None

    quoted = found.group(1)
    # Drop the opening and closing quote characters before unescaping.
    return unescape_char(quoted[1:-1]), found.end()
|
|
|
|
|
|
def parse_link(src: str, pos: int) -> Union[Tuple[Dict[str, Any], int], Tuple[None, None]]:
    """Parse a link destination, an optional title and the closing ``)``.

    :param src: text being parsed.
    :param pos: index where the destination starts.
    :return: ``(attrs, end)`` where ``attrs`` holds ``"url"`` and, when the
        title is non-empty, ``"title"``; ``end`` is the index just past the
        closing parenthesis.  ``(None, None)`` if the destination or the
        closing parenthesis is missing.
    """
    href, href_pos = parse_link_href(src, pos)
    if href is None:
        return None, None
    assert href_pos is not None

    title, title_pos = parse_link_title(src, href_pos, len(src))

    # Continue after the title when there was one, otherwise after the href.
    closing = PAREN_END_RE.match(src, title_pos or href_pos)
    if closing is None:
        return None, None

    attrs = {"url": escape_url(unescape_char(href))}
    if title:
        attrs["title"] = title
    return attrs, closing.end()
|