Coverage for yuio/md.py: 90% (725 statements)
# Yuio project, MIT license.
#
# https://github.com/taminomara/yuio/
#
# You're free to copy this file to your project and edit it for your needs,
# just keep this copyright line please :3

"""
Parser for Markdown/MyST.

Yuio supports all CommonMark features except tables. It also supports directives
and interpreted text via MyST_ syntax.

**Supported block markup:**

- headings,
- numbered and bullet lists,
- code blocks using backticks and indentation,
- MyST-style code blocks using colons,
- code blocks containing MyST directives,
- quotes,
- hyperlink targets,
- thematic breaks.

**Supported directives:**

- code:
  ``code-block``,
  ``sourcecode``,
  ``code``;
- admonitions:
  ``attention``,
  ``caution``,
  ``danger``,
  ``error``,
  ``hint``,
  ``important``,
  ``note``,
  ``seealso``,
  ``tip``,
  ``warning``;
- versioning:
  ``versionadded``,
  ``versionchanged``,
  ``deprecated``;
- any other directive is rendered as un-highlighted code.

**Supported inline syntax:**

- emphasis (``*em*``),
- strong emphasis (``**strong**``),
- inline code in backticks (```code```),
- inline math (``$math$``),
- MyST-style interpreted text (``{role}`content```),
- hyperlinks (``[text](link)``, ``[text][anchor]``, ``[anchor]``)
  in terminals that can render them,
- backslash-escaping.

**Supported inline roles:**

- ``flag`` for CLI flags,
- any other role is interpreted as a documentation reference with explicit titles
  (``{py:class}`title <mod.Class>```) and shortening paths via tilde
  (``{py:class}`~mod.Class```).
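
For example, parsing a small document (a minimal sketch; rendering the
resulting ``yuio.doc.Document`` is handled elsewhere in Yuio)::

    import yuio.md

    doc = yuio.md.parse("Some *text* with `code` and a [link](https://example.com).")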

.. _MyST: https://myst-parser.readthedocs.io/

.. autofunction:: parse

.. autoclass:: MdParser
   :members:
"""

from __future__ import annotations

import dataclasses
import re
import string
from dataclasses import dataclass

import yuio.doc
from yuio.util import dedent as _dedent

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    import typing_extensions as _t
else:
    from yuio import _typing as _t

__all__ = [
    "MdParser",
    "parse",
]


T = _t.TypeVar("T")


_HEADING_RE = re.compile(
    r"""
    ^
    \s{0,3}              # - Initial indent.
    (?P<marker>\#{1,6})  # - Heading marker.
    (?P<text>(?:\s.*?)?) # - Heading text. Unless empty, text must be separated
                         #   from the heading marker by a space.
    (?:(?<=\s)\#+)?      # - Optional closing hashes. Must be separated from
                         #   the previous content by a space. We use lookbehind
                         #   here, because if the text is empty, the space
                         #   between heading marker and closing hashes will be
                         #   matched by the `text` group.
    \s*                  # - Closing spaces.
    $
    """,
    re.VERBOSE,
)
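# E.g. _HEADING_RE matches "# Title", "## Title ##", and a bare "#",
# but not "#Title", where the text is not separated from the marker.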
_SETEXT_HEADING_RE = re.compile(
    r"""
    ^
    (?P<indent>\s{0,3}) # - Initial indent.
    (?P<level>-|=)      # - Heading underline.
    \2*                 # - More heading underline.
    \s*                 # - Closing spaces.
    $
    """,
    re.VERBOSE,
)
_LIST_RE = re.compile(
    r"""
    ^
    (?P<marker>
      \s{0,3}                # - Initial indent.
      (?P<type>[-*+])        # - List marker.
      (?:
        \s(?:\s{0,3}(?=\S))? # - One mandatory and up to three optional spaces;
                             #   When there are more than three optional spaces,
                             #   we treat them as a list marker followed
                             #   by a single space, followed by a code block.
      | $))                  # - For cases when a list starts with an empty line.
    (?P<text>.*)             # - Text of the first line in the list.
    $
    """,
    re.VERBOSE,
)
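# E.g. _LIST_RE matches "- item", "  * item", and a bare "-" starting a list
# with an empty line.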
_NUMBERED_LIST_RE = re.compile(
    r"""
    ^
    (?P<marker>
      \s{0,3}                # - Initial indent.
      (?P<number>\d{1,9})    # - Number.
      (?P<type>[.:)])        # - Numbered list marker.
      (?:
        \s(?:\s{0,3}(?=\S))? # - One mandatory and up to three optional spaces;
                             #   When there are more than three optional spaces,
                             #   we treat them as a list marker followed
                             #   by a single space, followed by a code block.
      | $))                  # - For cases when a list starts with an empty line.
    (?P<text>.*)             # - Text of the first line in the list.
    $
    """,
    re.VERBOSE,
)
_CODE_BACKTICK_RE = re.compile(
    r"""
    ^
    (?P<indent>\s{0,3}) # - Initial indent.
    (?P<fence>```+)     # - Backtick fence.
    (?P<syntax>[^`]*)   # - Syntax, can't contain backtick.
    $
    """,
    re.VERBOSE,
)
_CODE_TILDE_RE = re.compile(
    r"""
    ^
    (?P<indent>\s{0,3})  # - Initial indent.
    (?P<fence>~~~+|:::+) # - Tilde or colon fence.
    (?P<syntax>.*)       # - Syntax, can be anything.
    $
    """,
    re.VERBOSE,
)
_CODE_FENCE_END_RE = re.compile(
    r"""
    ^
    (?P<indent>\s{0,3})       # - Initial indent.
    (?P<fence>~~~+|```+|:::+) # - Fence.
    \s*                       # - Closing spaces.
    $
    """,
    re.VERBOSE,
)
_CODE_RE = re.compile(
    r"""
    ^
    \s{4}        # - Initial code indent.
    (?P<text>.*) # - First code line.
    $
    """,
    re.VERBOSE,
)
_QUOTE_RE = re.compile(
    r"""
    ^
    (?P<indent>\s{0,3}) # - Initial quote indent.
    >                   # - Quote marker.
    \s?                 # - Optional space after the marker.
    (?P<text>.*)        # - Text of the first line in the quote.
    $
    """,
    re.VERBOSE,
)
_THEMATIC_BREAK_RE = re.compile(
    r"""
    ^
    (?P<indent>\s{0,3})   # - Initial indent.
    ([-*_])\s*(\2\s*){2,} # - At least three break characters separated by spaces.
    $
    """,
    re.VERBOSE,
)
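# E.g. _THEMATIC_BREAK_RE matches "---", "***", and "* * *".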
_LINK_ANCHOR_RE = re.compile(
    r"""
    ^
    (?P<indent>\s{0,3})      # - Initial indent.
    \[                       # - Opening marker.
    (?P<anchor>
      (?:[^\[\]]|\\.){1,999} # - Link anchor, up to 999 symbols.
    )
    \]:                      # - Closing marker.
    (?P<href>.*)             # - Url. If empty, we look for url on the next line.
    $
    """,
    re.VERBOSE,
)
_MYST_DIRECTIVE_NAME_RE = re.compile(
    r"""
    ^
    \{                     # - Directive name starts with an opening brace.
    (?P<directive_name>(?: # - The actual name consists of:
      [a-zA-Z0-9]          #   - alphanumerics,
      | [-_+:,](?![-_+:,]) #   - or isolated special characters,
    )+)                    # - and it's non-empty.
    \}                     # - It ends with a closing brace.
    (?P<arg>.*)            # - Followed by directive arguments.
    $
    """,
    re.VERBOSE,
)
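# E.g. _MYST_DIRECTIVE_NAME_RE matches "{note}" and "{code-block} python",
# but not "{a--b}", where the special characters are not isolated.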
_LINE_FEED_RE = re.compile(r"\r\n|\r|\n|\v\r\n|\v\r|\v\n|\v")


@dataclass(slots=True)
class _Token:
    """
    Token for processing inline markup.
    """

    start: int
    end: int
    kind: str

    # Length can decrease as we use up emphasis symbols.
    len: int = dataclasses.field(init=False)

    # Emphasis data.
    can_open: bool = False
    can_close: bool = False
    prev_delim: int = -1
    next_delim: int = -1

    # Action data.
    _data: dict[str, _t.Any] | None = dataclasses.field(init=False, default=None)

    def __post_init__(self):
        self.len = self.end - self.start

    @property
    def data(self):
        if self._data is None:
            self._data = {}
        return self._data


@dataclass(kw_only=True, slots=True)
class _Default:
    pass


@dataclass(kw_only=True, slots=True)
class _List:
    type: str
    marker_len: int
    list: yuio.doc.List
    parser: MdParser
    number: int | None = None
    starts_with_empty_line: bool = False


@dataclass(kw_only=True, slots=True)
class _Quote:
    parser: MdParser


@dataclass(kw_only=True, slots=True)
class _Code:
    lines: list[str]


@dataclass(kw_only=True, slots=True)
class _FencedCode:
    indent: int
    fence_symbol: str
    fence_length: int
    syntax: str
    lines: list[str]


@dataclass(kw_only=True, slots=True)
class _Paragraph:
    lines: list[str]


@dataclass(kw_only=True, slots=True)
class _Anchor:
    anchor: str


_State: _t.TypeAlias = (
    _Default | _List | _Quote | _Code | _FencedCode | _Paragraph | _Anchor
)


@_t.final
class MdParser(yuio.doc.DocParser):
    """
    Parses a subset of CommonMark/MyST.
    """

    def __init__(self):
        self._nodes: list[yuio.doc.AstBase] = []
        self._state: _State = _Default()
        self._anchors: dict[str, tuple[str, str]] = {}

    def _parser(self) -> MdParser:
        parser = MdParser()
        parser._anchors = self._anchors
        return parser

    @staticmethod
    def _is_blank(s: str) -> bool:
        return not s or s.isspace()

    def parse(self, s: str) -> yuio.doc.Document:
        s = s.expandtabs(tabsize=4)
        root = self._do_parse(_LINE_FEED_RE.split(s))
        yuio.doc._clean_tree(root)
        self._process_inline_text(root)
        return root

    def parse_paragraph(self, s: str, /) -> list[str | yuio.doc.TextRegion]:
        return _InlineParser(s, {}).run()

    def _do_parse(self, lines: list[str]):
        for line in lines:
            self._handle_line(line)
        return yuio.doc.Document(items=self._finalize())

    def _process_inline_text(self, node: yuio.doc.AstBase):
        if isinstance(node, yuio.doc.Admonition):
            processor = _InlineParser("\n".join(map(str, node.title)), self._anchors)
            node.title = processor.run()
        if isinstance(node, yuio.doc.Text):
            processor = _InlineParser("\n".join(map(str, node.items)), self._anchors)
            node.items = processor.run()
        elif isinstance(node, yuio.doc.Container):
            for item in node.items:
                self._process_inline_text(item)

    def _handle_line(self, line: str):
        getattr(self, f"_handle_line_{self._state.__class__.__name__.lstrip('_')}")(
            line
        )

    def _handle_lazy_line(self, line: str) -> bool:
        return getattr(
            self, f"_handle_lazy_line_{self._state.__class__.__name__.lstrip('_')}"
        )(line)

    def _flush(self):
        getattr(self, f"_flush_{self._state.__class__.__name__.lstrip('_')}")()

    def _handle_line_List(self, line: str):
        assert type(self._state) is _List
        if self._is_blank(line) and self._state.starts_with_empty_line:
            self._flush_List()
            self._handle_line_Default(line)
        elif self._is_blank(line) or line[: self._state.marker_len].isspace():
            self._state.parser._handle_line(line[self._state.marker_len :])
        elif (
            (
                (match := _LIST_RE.match(line))
                or (match := _NUMBERED_LIST_RE.match(line))
            )
            and match.group("type") == self._state.type
            and not _THEMATIC_BREAK_RE.match(line)
        ):
            item = yuio.doc.ListItem(
                items=self._state.parser._finalize(),
                number=self._state.number,
            )
            self._state.list.items.append(item)
            marker = match.group("marker")
            indent = len(marker)
            if not marker.endswith(" "):
                indent += 1
            self._state.marker_len = indent
            self._state.parser._handle_line(match.group("text"))
            if self._state.number is not None:
                self._state.number += 1
        elif not self._state.parser._handle_lazy_line(line):
            self._flush_List()
            self._handle_line_Default(line)

    def _handle_lazy_line_List(self, line: str) -> bool:
        assert type(self._state) is _List
        return self._state.parser._handle_lazy_line(line)

    def _flush_List(self):
        assert type(self._state) is _List
        item = yuio.doc.ListItem(
            items=self._state.parser._finalize(),
            number=self._state.number,
        )
        self._state.list.items.append(item)
        self._nodes.append(self._state.list)
        self._state = _Default()

    def _handle_line_Quote(self, line: str):
        assert type(self._state) is _Quote
        if match := _QUOTE_RE.match(line):
            self._state.parser._handle_line(match.group("text"))
        elif self._is_blank(line) or not self._state.parser._handle_lazy_line(line):
            self._flush_Quote()
            self._handle_line_Default(line)

    def _handle_lazy_line_Quote(self, line: str) -> bool:
        assert type(self._state) is _Quote
        return self._state.parser._handle_lazy_line(line)

    def _flush_Quote(self):
        assert type(self._state) is _Quote
        self._nodes.append(yuio.doc.Quote(items=self._state.parser._finalize()))
        self._state = _Default()

    def _handle_line_Code(self, line: str):
        assert type(self._state) is _Code
        if self._is_blank(line) or line.startswith("    "):
            self._state.lines.append(line[4:])
        else:
            self._flush_Code()
            self._handle_line_Default(line)

    def _handle_lazy_line_Code(self, line: str) -> bool:
        assert type(self._state) is _Code
        return False  # No lazy continuations for code!

    def _flush_Code(self):
        assert type(self._state) is _Code
        while self._state.lines and self._is_blank(self._state.lines[-1]):
            self._state.lines.pop()
        if self._state.lines:
            self._nodes.append(
                yuio.doc.Code(
                    lines=self._state.lines,
                    syntax="",
                )
            )
        self._state = _Default()

    def _handle_line_FencedCode(self, line: str):
        assert type(self._state) is _FencedCode
        if (
            (match := _CODE_FENCE_END_RE.match(line))
            and match.group("fence")[0] == self._state.fence_symbol
            and len(match.group("fence")) >= self._state.fence_length
        ):
            self._flush_FencedCode()
        else:
            if self._state.indent == 0:
                pass
            elif line[: self._state.indent].isspace():
                line = line[self._state.indent :]
            else:
                line = line.lstrip()
            self._state.lines.append(line)

    def _handle_lazy_line_FencedCode(self, line: str) -> bool:
        assert type(self._state) is _FencedCode
        return False

    def _flush_FencedCode(self):
        assert type(self._state) is _FencedCode
        if match := _MYST_DIRECTIVE_NAME_RE.match(self._state.syntax):
            # This is a MyST directive.
            first_actual_line = 0

            # Parse yaml options block.
            if (
                first_actual_line < len(self._state.lines)
                and self._state.lines[first_actual_line] == "---"
            ):
                first_actual_line += 1
                while (
                    first_actual_line < len(self._state.lines)
                    and self._state.lines[first_actual_line] != "---"
                ):
                    first_actual_line += 1
                if first_actual_line < len(self._state.lines):
                    # Skip the closing `---` of the options block.
                    first_actual_line += 1
            # Parse normal options block.
            if first_actual_line < len(self._state.lines) and self._state.lines[
                first_actual_line
            ].startswith(":"):
                first_actual_line += 1
            # Trim empty lines.
            if (
                first_actual_line < len(self._state.lines)
                and not self._state.lines[first_actual_line].strip()
            ):
                first_actual_line += 1
            self._state.lines = self._state.lines[first_actual_line:]

            name = match.group("directive_name")
            arg = match.group("arg").strip()
        else:
            name = "code-block"
            arg = self._state.syntax

        self._nodes.extend(
            yuio.doc._process_directive(
                name,
                arg,
                lambda: self._state.lines,  # type: ignore
                lambda: self._parser()._do_parse(self._state.lines).items,  # type: ignore
            )
        )
        self._state = _Default()

    def _handle_line_Paragraph(self, line: str):
        assert type(self._state) is _Paragraph
        if match := _SETEXT_HEADING_RE.match(line):
            level = 1 if match.group("level") == "=" else 2
            self._nodes.append(
                yuio.doc.Heading(
                    items=_t.cast(list[str | yuio.doc.TextRegion], self._state.lines),
                    level=level,
                )
            )
            self._state = _Default()
        elif (
            self._is_blank(line)
            or _THEMATIC_BREAK_RE.match(line)
            or _HEADING_RE.match(line)
            or _CODE_BACKTICK_RE.match(line)
            or _CODE_TILDE_RE.match(line)
            or (
                (match := _LIST_RE.match(line))
                and not self._is_blank(match.group("text"))
            )
            or (
                (match := _NUMBERED_LIST_RE.match(line))
                and not self._is_blank(match.group("text"))
                and match.group("number") == "1"
            )
            or _QUOTE_RE.match(line)
        ):
            self._flush_Paragraph()
            self._handle_line_Default(line)
        else:
            self._state.lines.append(line)

    def _handle_lazy_line_Paragraph(self, line: str) -> bool:
        assert type(self._state) is _Paragraph
        if (
            self._is_blank(line)
            or _THEMATIC_BREAK_RE.match(line)
            or _HEADING_RE.match(line)
            or _CODE_BACKTICK_RE.match(line)
            or _CODE_TILDE_RE.match(line)
            or _LIST_RE.match(line)
            or _NUMBERED_LIST_RE.match(line)
            or _QUOTE_RE.match(line)
        ):
            self._flush_Paragraph()
            return False
        else:
            self._state.lines.append(line)
            return True

    def _flush_Paragraph(self):
        assert type(self._state) is _Paragraph
        self._nodes.append(
            yuio.doc.Paragraph(
                items=_t.cast(list[str | yuio.doc.TextRegion], self._state.lines)
            )
        )
        self._state = _Default()

    def _handle_line_Anchor(self, line: str):
        assert type(self._state) is _Anchor
        line = line.strip()
        if line:
            url, _ = _InlineParser.parse_link(line)
            if url:
                self._anchors.setdefault(self._state.anchor, (line, ""))
        else:
            self._nodes.append(yuio.doc.Paragraph(items=[f"[{self._state.anchor}]:"]))
        self._state = _Default()

    def _handle_lazy_line_Anchor(self, line: str):
        assert type(self._state) is _Anchor
        line = line.strip()
        if line:
            url, _ = _InlineParser.parse_link(line)
            if url:
                self._anchors.setdefault(self._state.anchor, (line, ""))
            self._state = _Default()
            return True
        else:
            self._nodes.append(yuio.doc.Paragraph(items=[f"[{self._state.anchor}]:"]))
            self._state = _Default()
            return False

    def _flush_Anchor(self):
        assert type(self._state) is _Anchor
        self._state = _Default()

    def _handle_line_Default(self, line: str):
        assert type(self._state) is _Default
        if self._is_blank(line):
            pass  # do nothing
        elif match := _LINK_ANCHOR_RE.match(line):
            anchor = match.group("anchor").strip()
            href = match.group("href").strip()
            if not anchor:
                self._state = _Paragraph(lines=[line])
            elif href:
                url, _ = _InlineParser.parse_link(href)
                if url is not None:
                    anchor = _InlineParser.norm_anchor(anchor)
                    self._anchors.setdefault(anchor, (url, ""))
                else:
                    self._state = _Paragraph(lines=[line])
            else:
                anchor = _InlineParser.norm_anchor(anchor)
                self._state = _Anchor(anchor=anchor)
        elif _THEMATIC_BREAK_RE.match(line):
            self._nodes.append(yuio.doc.ThematicBreak())
        elif match := _HEADING_RE.match(line):
            level = len(match.group("marker"))
            self._nodes.append(
                yuio.doc.Heading(
                    items=[match.group("text").strip()],
                    level=level,
                )
            )
        elif (match := _CODE_BACKTICK_RE.match(line)) or (
            match := _CODE_TILDE_RE.match(line)
        ):
            indent = len(match.group("indent"))
            syntax = match.group("syntax").strip()
            fence_symbol = match.group("fence")[0]
            fence_length = len(match.group("fence"))
            self._state = _FencedCode(
                indent=indent,
                fence_symbol=fence_symbol,
                fence_length=fence_length,
                syntax=syntax,
                lines=[],
            )
        elif match := _CODE_RE.match(line):
            self._state = _Code(lines=[match.group("text")])
        elif (match := _LIST_RE.match(line)) or (
            match := _NUMBERED_LIST_RE.match(line)
        ):
            marker = match.group("marker")
            indent = len(marker)
            if not marker.endswith(" "):
                indent += 1
            list_type = match.group("type")
            number_str = match.groupdict().get("number", None)
            number = int(number_str) if number_str else None
            starts_with_empty_line = self._is_blank(match.group("text"))
            self._state = _List(
                type=list_type,
                marker_len=indent,
                list=yuio.doc.List(
                    items=[],
                    enumerator_kind=(
                        yuio.doc.ListEnumeratorKind.NUMBER
                        if number is not None
                        else None
                    ),
                ),
                parser=self._parser(),
                number=number,
                starts_with_empty_line=starts_with_empty_line,
            )
            self._state.parser._handle_line(match.group("text"))
        elif match := _QUOTE_RE.match(line):
            self._state = _Quote(parser=self._parser())
            self._state.parser._handle_line(match.group("text"))
        else:
            self._state = _Paragraph(lines=[line])

    def _handle_lazy_line_Default(self, line: str) -> bool:
        assert type(self._state) is _Default
        return False

    def _flush_Default(self):
        assert type(self._state) is _Default

    def _finalize(self) -> list[yuio.doc.AstBase]:
        self._flush()
        result = self._nodes
        self._nodes = []
        return result


_UNESCAPE_RE = re.compile(rf"\\([{re.escape(string.punctuation)}])")


class _InlineParser:
    # Based on https://spec.commonmark.org/0.31.2/#phase-2-inline-structure

    def __init__(self, text: str, anchors: dict[str, tuple[str, str]]) -> None:
        self._text = text
        self._pos = 0
        self._anchors = anchors
        self._tokens: list[_Token] = []
        self._link_opener_indices: list[int] = []
        self._delim_first = -1
        self._delim_last = -1

    @staticmethod
    def norm_anchor(anchor: str) -> str:
        return re.sub(r"\s+", " ", anchor.strip()).casefold()

    @staticmethod
    def unescape(text: str) -> str:
        return _UNESCAPE_RE.sub(r"\1", text)

    def run(self) -> list[str | yuio.doc.TextRegion]:
        while self._fits(self._pos):
            self._run()
        self._process_delims()

        res = yuio.doc.TextRegion()
        stack = [res]

        em = 0
        strong = 0

        def add_text(text: str | yuio.doc.TextRegion):
            if not text:
                return
            colors = []
            if em:
                colors.append("em")
            if strong:
                colors.append("strong")
            if colors:
                text = yuio.doc.HighlightedRegion(text, color=" ".join(colors))
            stack[-1].content.append(text)

        for token in self._tokens:
            match token.kind:
                case "text":
                    text = self._text[token.start : token.start + token.len]
                    add_text(text)
                case "*" | "_":
                    em += token.data.get("em", 0)
                    strong += token.data.get("strong", 0)
                    text = self._text[token.start : token.start + token.len]
                    add_text(text)
                case "link_start":
                    if (url := token.data.get("url")) is not None:
                        stack.append(yuio.doc.LinkRegion(url=url))
                    else:
                        text = self._text[token.start : token.start + token.len]
                        add_text(text)
                case "link_end":
                    assert len(stack) > 1
                    top = stack.pop()
                    stack[-1].content.append(top)
                case "escape":
                    text = self._text[token.start : token.start + token.len]
                    if text == "\n":
                        text = "\v\n"  # Vertical tab forces wrapper to make a line break.
                    elif not text or text not in string.punctuation:
                        text = "\\" + text
                    add_text(text)
                case "formatted":
                    add_text(token.data["content"])
                case kind:
                    assert False, kind

        return res.content

    @classmethod
    def parse_link(cls, link: str):
        return cls(link + ")", {})._parse_link()

    def _fits(self, i):
        return i < len(self._text)

    def _ch_eq(self, i, cs):
        return self._fits(i) and self._text[i] == cs

    def _ch_in(self, i, cs):
        return self._fits(i) and self._text[i] in cs

    def _ch_at(self, i):
        if 0 <= i < len(self._text):
            return self._text[i]
        else:
            return " "

    def _eat(self, ch):
        start = self._pos
        while self._pos < len(self._text) and self._text[self._pos] == ch:
            self._pos += 1
        return self._pos - start

    def _eat_in(self, ch):
        start = self._pos
        while self._pos < len(self._text) and self._text[self._pos] in ch:
            self._pos += 1
        return self._pos - start

    def _eat_not_in(self, ch):
        start = self._pos
        while self._pos < len(self._text) and self._text[self._pos] not in ch:
            self._pos += 1
        return self._pos - start

    def _run(self):
        match self._text[self._pos]:
            case "\\":
                self._tokens.append(_Token(self._pos + 1, self._pos + 2, "escape"))
                self._pos += 2
            case "`":
                self._parse_code()
            case "$":
                self._parse_math()
            case "{":
                self._parse_role()
            case "!" if self._ch_eq(self._pos + 1, "["):
                self._push_link_start("image_start", 2)
            case "[":
                self._push_link_start("link_start", 1)
            case "]":
                self._parse_link_end()
            case "*" | "_":
                self._parse_delim_run()
            case "!" | "\\":
                self._tokens.append(_Token(self._pos, self._pos + 1, "text"))
                self._pos += 1
            case _:
                start = self._pos
                self._eat_not_in("\\`[]!*_{$")
                self._tokens.append(_Token(start, self._pos, "text"))

    def _parse_role(self):
        start = self._pos
        self._pos += 1
        # Role name: alphanumerics plus isolated internal hyphens, underscores,
        # plus signs, colons, and commas.

        while self._fits(self._pos):
            match self._text[self._pos]:
                case "}":
                    self._pos += 1
                    break
                case ch if ch.isalnum():
                    self._pos += 1
                case ch if ch in "-_+:," and not self._ch_in(self._pos + 1, "-_+:,"):
                    self._pos += 1
                case _:
                    # Not a valid role name, emit `{` as text and continue after it.
                    self._pos = start + 1
                    self._tokens.append(_Token(start, self._pos, "text"))
                    return
        if self._ch_eq(self._pos, "`"):
            role = self._text[start + 1 : self._pos - 1]
            self._parse_code(role)

    def _parse_code(self, role: str | None = None):
        start = self._pos
        n_backticks = self._eat("`")

        end = None
        while self._fits(self._pos):
            if self._text[self._pos] == "`":
                n_backticks_end = self._eat("`")
                if n_backticks == n_backticks_end:
                    end = self._pos
                    break
            else:
                self._pos += 1

        if end is None:
            self._tokens.append(_Token(start, start + n_backticks, "text"))
            self._pos = start + n_backticks
        else:
            code = self._text[start + n_backticks : end - n_backticks]
            if (
                code.startswith((" ", "\n"))
                and code.endswith((" ", "\n"))
                and len(code) > 2
            ):
                code = code[1:-1]
                start += 1
                end -= 1
            token = _Token(start + n_backticks, end - n_backticks, "formatted")
            token.data["content"] = yuio.doc._process_role(code, role or "code")
            self._tokens.append(token)

    def _parse_math(self):
        start = self._pos
        n_markers = self._eat("$")
        if n_markers > 2:
            self._tokens.append(_Token(start, self._pos, "text"))
            return

        end = None
        while self._fits(self._pos):
            if self._text[self._pos] == "$":
                n_markers_end = self._eat("$")
                if n_markers == n_markers_end:
                    end = self._pos
                    break
            else:
                self._pos += 1

        if end is None:
            self._tokens.append(_Token(start, start + n_markers, "text"))
            self._pos = start + n_markers
        else:
            code = self._text[start + n_markers : end - n_markers]
            token = _Token(start + n_markers, end - n_markers, "formatted")
            token.data["content"] = yuio.doc._process_role(code, "math")
            self._tokens.append(token)

    def _push_link_start(self, kind, length):
        self._link_opener_indices.append(len(self._tokens))
        self._tokens.append(
            _Token(
                self._pos,
                self._pos + length,
                kind,
            )
        )
        self._pos += length

    def _parse_link_end(self):
        if not self._link_opener_indices:
            # No corresponding link opener.
            self._tokens.append(_Token(self._pos, self._pos + 1, "text"))
            self._pos += 1
            return
        opener_token_idx = self._link_opener_indices.pop()
        opener_token = self._tokens[opener_token_idx]
        assert opener_token.kind in ["link_start", "image_start"]

        start = self._pos
        self._pos += 1

        if self._ch_eq(self._pos, "("):
            self._pos += 1
            url, title = self._parse_link()
        else:
            if self._ch_eq(self._pos, "["):
                self._pos += 1
                anchor = self._parse_anchor()
            else:
                anchor = self._text[opener_token.end : self._pos - 1]
            if anchor:
                url, title = self._anchors.get(self.norm_anchor(anchor), (None, None))
            else:
                url, title = None, None

        if url is None:
            self._tokens.append(_Token(start, start + 1, "text"))
            self._pos = start + 1
            return

        if opener_token.kind == "link_start":
            close_token = _Token(start, self._pos, "link_end")
            self._link_opener_indices.clear()  # Prevent nested links.
        else:
            close_token = _Token(start, self._pos, "image_end")
        opener_token.data["url"] = url
        opener_token.data["title"] = title
        opener_token.len = 0
        close_token.data["url"] = None
        close_token.data["title"] = None
        close_token.len = 0
        self._tokens.append(close_token)
        self._process_delims(opener_token_idx)

    def _parse_link(self):
        if self._ch_eq(self._pos, "<"):
            self._pos += 1
            url = self._parse_href_angled()
        else:
            url = self._parse_href_bare()
        if url is None:
            return None, None  # Href parsing failed.
        if self._ch_in(self._pos, " )"):
            title = self._parse_title()
            if title is None:
                return None, None  # Title parsing failed.
            else:
                url = self.unescape(url)  # Normal escaping rules apply.
                return url, title
        else:
            return None, None  # Href does not end with expected symbol.

    def _parse_href_angled(self):
        start = self._pos
        while self._fits(self._pos):
            match self._text[self._pos]:
                case "\\" if self._ch_in(self._pos + 1, string.punctuation):
                    self._pos += 2
                case ">":
                    self._pos += 1
                    return self._text[start : self._pos - 1]
                case "<" | "\n":
                    break
                case _:
                    self._pos += 1
        return None

    def _parse_href_bare(self):
        start = self._pos
        paren_level = 1
        url = None
        while self._fits(self._pos):
            match self._text[self._pos]:
                case "\\" if self._ch_in(self._pos + 1, string.punctuation):
                    self._pos += 2
                case ch if 0x00 <= ord(ch) <= 0x1F:
                    break
                case "\x7f":
                    break
                case " ":
                    url = self._text[start : self._pos]
                    break
                case "(":
                    paren_level += 1
                    self._pos += 1
                case ")":
                    paren_level -= 1
                    if paren_level == 0:
                        url = self._text[start : self._pos]
                        break
                    else:
                        self._pos += 1
                case _:
                    self._pos += 1
        if not url:
            # Empty url is not allowed in this case.
            url = None
        return url

    def _parse_title(self):
        self._eat(" ")
        if self._ch_eq(self._pos, ")"):
            self._pos += 1
            return ""  # Empty title is ok.
        elif self._ch_eq(self._pos, "'"):
            self._pos += 1
            end_char = "'"
        elif self._ch_eq(self._pos, '"'):
            self._pos += 1
            end_char = '"'
        elif self._ch_eq(self._pos, "("):
            self._pos += 1
            end_char = ")"
        else:
            return None  # Title parsing failed.
        start = self._pos
        title = None
        while self._fits(self._pos):
            match self._text[self._pos]:
                case "\\" if self._ch_in(self._pos + 1, string.punctuation):
                    self._pos += 2
                case ch if ch == end_char:
                    title = self._text[start : self._pos]
                    self._pos += 1
                    break
                case _:
                    self._pos += 1
        if self._ch_eq(self._pos, ")"):
            self._pos += 1
        else:
            return None  # Href does not end with expected symbol.
        return title

    def _parse_anchor(self):
        start = self._pos
        while self._fits(self._pos):
            match self._text[self._pos]:
                case "\\" if self._ch_in(self._pos + 1, string.punctuation):
                    self._pos += 2
                case "]":
                    self._pos += 1
                    return self._text[start : self._pos - 1]
                case _:
                    self._pos += 1
        return None

    def _parse_delim_run(self):
        start = self._pos
        ch = self._text[self._pos]
        self._eat(ch)

        char_before = self._ch_at(start - 1)
        char_after = self._ch_at(self._pos)

        left_flanking = not char_after.isspace() and (
            char_after not in string.punctuation
            or char_before.isspace()
            or char_before in string.punctuation
        )

        right_flanking = not char_before.isspace() and (
            char_before not in string.punctuation
            or char_after.isspace()
            or char_after in string.punctuation
        )

        if ch == "*":
            can_open = left_flanking
            can_close = right_flanking
        else:  # "_"
            can_open = left_flanking and (
                not right_flanking or (char_before in string.punctuation)
            )
            can_close = right_flanking and (
                not left_flanking or (char_after in string.punctuation)
            )

        if can_open or can_close:
            self._tokens.append(
                _Token(start, self._pos, ch, can_open=can_open, can_close=can_close)
            )
            self._push_delim(-1)
        else:
            self._tokens.append(_Token(start, self._pos, "text"))

    def _push_delim(self, idx: int):
        if idx == -1:
            idx += len(self._tokens)
        assert idx >= 0
        assert self._tokens[idx].kind in "*_"
        assert self._tokens[idx].prev_delim == -1
        assert self._tokens[idx].next_delim == -1

        if self._delim_last == -1:
            self._delim_last = self._delim_first = idx
        else:
            self._tokens[self._delim_last].next_delim = idx
            self._tokens[idx].prev_delim = self._delim_last
            self._delim_last = idx

    def _remove_delim(self, idx: int):
        tok = self._tokens[idx]
        if tok.prev_delim == -1:
            self._delim_first = tok.next_delim
        else:
            self._tokens[tok.prev_delim].next_delim = tok.next_delim
        if tok.next_delim == -1:
            self._delim_last = tok.prev_delim
        else:
            self._tokens[tok.next_delim].prev_delim = tok.prev_delim

    def _next_delim(self, idx: int):
        if idx == -1:
            return self._delim_first
        else:
            return self._tokens[idx].next_delim

    def _prev_delim(self, idx: int):
        if idx == -1:
            return self._delim_last
        else:
            return self._tokens[idx].prev_delim

    def _process_delims(self, first_delim: int = -1):
        if first_delim == -1:
            bottom_idx = -1
        else:
            for i in range(first_delim, len(self._tokens)):
                if self._tokens[i].kind in "*_":
                    bottom_idx = self._prev_delim(i)
                    break
            else:
                bottom_idx = -1
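        # "Openers bottom" optimization from the CommonMark algorithm: for each
        # (delimiter kind, closer run length mod 3, closer can-open flag) we
        # remember the position below which no matching opener exists, so we
        # never rescan that part of the delimiter chain.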
        openers_bottom_idxs = {
            ("*", 0, False): bottom_idx,
            ("*", 1, False): bottom_idx,
            ("*", 2, False): bottom_idx,
            ("*", 0, True): bottom_idx,
            ("*", 1, True): bottom_idx,
            ("*", 2, True): bottom_idx,
            ("_", 0, False): bottom_idx,
            ("_", 1, False): bottom_idx,
            ("_", 2, False): bottom_idx,
            ("_", 0, True): bottom_idx,
            ("_", 1, True): bottom_idx,
            ("_", 2, True): bottom_idx,
        }

        current_idx = self._next_delim(bottom_idx)
        while True:
            while current_idx != -1 and not self._tokens[current_idx].can_close:
                current_idx = self._next_delim(current_idx)
            if current_idx == -1:
                break

            # Current is a potential closer, find a matching opener for it.
            current = self._tokens[current_idx]
            bottom_idx_for_current = max(
                bottom_idx,
                openers_bottom_idxs[(current.kind, current.len % 3, current.can_open)],
            )

            opener_idx = self._prev_delim(current_idx)
            while opener_idx > bottom_idx_for_current:
                opener = self._tokens[opener_idx]

                # "If one of the delimiters can both open and close emphasis,
                # then the sum of the lengths of the delimiter runs containing
                # the opening and closing delimiters must not be a multiple
                # of 3 unless both lengths are multiples of 3."
                #
                # See https://spec.commonmark.org/0.31.2/#emphasis-and-strong-emphasis.
                if (
                    opener.can_open
                    and opener.kind == current.kind
                    and (
                        # None of the delimiters can open and close at the same time...
                        not (opener.can_close or current.can_open)
                        # ...or sum of their lengths is not a multiple of 3...
                        or (opener.len + current.len) % 3 != 0
                        # ...or both lengths are multiples of 3.
                        or not (opener.len % 3 != 0 or current.len % 3 != 0)
                    )
                ):
                    # Found an opener for current.
                    is_strong = opener.len >= 2 and current.len >= 2

                    data_key = "strong" if is_strong else "em"
                    opener.data.setdefault(data_key, 0)
                    opener.data[data_key] += 1
                    current.data.setdefault(data_key, 0)
                    current.data[data_key] -= 1

                    opener.next_delim = current_idx
                    current.prev_delim = opener_idx

                    opener.len -= 1 + is_strong
                    if not opener.len:
                        self._remove_delim(opener_idx)

                    current.len -= 1 + is_strong
                    next_idx = current_idx
                    if not current.len:
                        next_idx = self._next_delim(current_idx)
                        self._remove_delim(current_idx)

                    current_idx = next_idx

                    break
                else:
                    opener_idx = self._prev_delim(opener_idx)
            else:
                # No opener for current.
                openers_bottom_idxs[
                    (current.kind, current.len % 3, current.can_open)
                ] = self._prev_delim(current_idx)
                next_idx = self._next_delim(current_idx)
                if not current.can_open:
                    self._remove_delim(current_idx)
                current_idx = next_idx


def parse(text: str, /, *, dedent: bool = True) -> yuio.doc.Document:
    """
    Parse a markdown document and return an AST node.

    :param text:
        text to parse. Common indentation will be removed from this string,
        making it suitable to use with triple quote literals.
    :param dedent:
        remove leading indent from `text`.
    :returns:
        parsed AST node.
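
    Example (illustrative)::

        doc = parse(
            '''
            # Usage

            Run `app --help` for *more* info.
            '''
        )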
1319 """
1321 if dedent:
1322 text = _dedent(text)
1324 return MdParser().parse(text)