# Yuio project, MIT license.
#
# https://github.com/taminomara/yuio/
#
# You're free to copy this file to your project and edit it for your needs,
# just keep this copyright line please :3

"""
Parser for ReStructuredText.

Yuio supports all RST features except tables and option lists.

**Supported block markup:**

- headings,
- numbered and bullet lists,
- definition lists,
- field lists,
- literal blocks, both indented and quoted,
- line blocks,
- quotes,
- doctest blocks,
- directives,
- hyperlink targets,
- footnotes,
- thematic breaks.

**Supported directives:**

- code:
  ``code-block``,
  ``sourcecode``,
  ``code``;
- admonitions:
  ``attention``,
  ``caution``,
  ``danger``,
  ``error``,
  ``hint``,
  ``important``,
  ``note``,
  ``seealso``,
  ``tip``,
  ``warning``;
- versioning:
  ``versionadded``,
  ``versionchanged``,
  ``deprecated``;
- any other directive is rendered as un-highlighted code.

**Supported inline syntax:**

- emphasis (``*em*``),
- strong emphasis (``**strong**``),
- inline code in backticks (```code```),
- interpreted text (```code```, ``:role:`code```),
- hyperlink references (```text`_``, ``text_``, ```text`__``, ``text__``)
  in terminals that can render them,
- footnotes (``[...]_``),
- inline internal targets and substitution references are parsed correctly,
  but they have no effect.

**Supported inline roles:**

- ``flag`` for CLI flags,
- any other role is interpreted as a documentation reference with explicit titles
  (``:py:class:`title <mod.Class>```) and path shortening via tilde
  (``:py:class:`~mod.Class```).
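
**Example:**

A minimal usage sketch, parsing a small document into a ``yuio.doc.Document``
tree::

    import yuio.rst

    doc = yuio.rst.parse(
        '''
        Greeting
        ========

        Hello, *world*!
        '''
    )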

.. autofunction:: parse

.. autoclass:: RstParser
   :members:

"""

from __future__ import annotations

import dataclasses
import re
import string
from dataclasses import dataclass
from enum import Enum

import yuio.doc
from yuio.util import dedent as _dedent

import yuio._typing_ext as _tx
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    import typing_extensions as _t
else:
    from yuio import _typing as _t

__all__ = [
    "RstParser",
    "parse",
]


class _LineEnding(Enum):
    NORMAL = "NORMAL"
    LITERAL_MARK = "LITERAL_MARK"  # Line ends with double colon


_LINE_BLOCK_START_RE = re.compile(
    r"""
    ^
    (?P<indent>
        (?P<open_marker>\|)
        (?P<space>\s+|$)
    )
    (?P<tail>.*)
    """,
    re.VERBOSE,
)


_BULLET_LIST_START_RE = re.compile(
    r"""
    ^
    (?P<indent>
        (?P<enumerator>[*+•‣⁃-])
        (?P<space>\s+|$)
    )
    (?P<tail>.*)
    $
    """,
    re.VERBOSE,
)


_NUM_LIST_START_RE = re.compile(
    r"""
    ^
    (?P<indent>
        (?P<open_marker>\(?)
        (?P<enumerator>
            (?P<enumerator_num>\d+)
            | (?P<enumerator_auto>\#)
            | (?P<enumerator_lowercase>[a-z]+)
            | (?P<enumerator_uppercase>[A-Z]+)
        )
        (?P<close_marker>[).])
        (?P<space>\s+|$)
    )
    (?P<tail>.*)
    $
    """,
    re.VERBOSE,
)


_EXPLICIT_MARKUP_START_RE = re.compile(
    r"""
    ^
    (?P<indent>
        (?P<open_marker>\.\.)
        (?P<space>\s+|$)
    )
    (?P<tail>.*)
    $
    """,
    re.VERBOSE,
)

_IMPLICIT_HYPERLINK_TARGET_RE = re.compile(
    r"""
    ^
    (?P<indent>
        (?P<open_marker>__)
        (?P<space>\s+|$)
    )
    (?P<tail>.*)
    $
    """,
    re.VERBOSE,
)

_FIELD_START_RE = re.compile(
    r"""
    ^
    (?P<indent>
        (?P<open_marker>:)
        (?P<content>(?:[^:\\]|\\.|:(?!\s|`))+)
        (?P<close_marker>:)
        (?P<space>\s+|$)
    )
    (?P<tail>.*)
    $
    """,
    re.VERBOSE,
)
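# For example, _FIELD_START_RE matches field list lines like
# ":param name: description", capturing "param name" as `content`
# and "description" as `tail`.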


_PUNCT = tuple(string.punctuation)


@dataclass(slots=True)
class _Hyperlink:
    start: int
    end: int
    name: str
    type: _t.Literal["link", "footnote", "redirect"]
    content: str
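

# Resolves hyperlink references against the targets collected during block
# parsing. Anonymous links (```text`__``) and auto-numbered or auto-symbol
# footnotes are matched up with their targets in document order.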
class _LinkResolver:
    def __init__(
        self,
        targets: dict[str, _Hyperlink],
        anonymous_links: list[_Hyperlink],
        auto_numbered_footnotes: list[str] = [],
        auto_character_footnotes: list[str] = [],
    ) -> None:
        self._targets: dict[str, _Hyperlink] = targets

        self._anonymous_links: list[_Hyperlink] = anonymous_links
        self._current_anonymous_link = 0

        self._auto_numbered_footnotes: list[str] = auto_numbered_footnotes
        self._current_auto_numbered_footnote = 0
        self._auto_character_footnotes: list[str] = auto_character_footnotes
        self._current_auto_character_footnote = 0

    def find_link(self, title: str, target: str | None, is_anonymous: bool):
        if target:
            # Process explicit target.
            target, is_redirect = _normalize_hyperlink_target(target)
            if is_redirect:
                link = self._resolve_redirect(target)
            else:
                link = _Hyperlink(0, 0, title, "link", target)
            if link and not is_anonymous:
                # Save implicitly declared anchor.
                anchor = _normalize_hyperlink_anchor(title)
                self._targets.setdefault(anchor, link)
        elif is_anonymous:
            link = self._next_anonymous_link()
        else:
            anchor = _normalize_hyperlink_anchor(title)
            if anchor.startswith("#"):
                anchor = anchor[1:]
                if not anchor:
                    anchor = self._next_auto_numbered_footnote() or ""
            elif anchor.startswith("*"):
                anchor = anchor[1:]
                if not anchor:
                    anchor = self._next_auto_character_footnote() or ""
            if not anchor:
                return None
            link = self._targets.get(anchor)
            if link and link.type == "redirect":
                link = self._resolve_redirect(link.content)
        if not link or not link.content:
            return None
        else:
            return link

    def _next_anonymous_link(self):
        if self._current_anonymous_link >= len(self._anonymous_links):
            return None
        link = self._anonymous_links[self._current_anonymous_link]
        self._current_anonymous_link += 1
        return link

    def _next_auto_numbered_footnote(self):
        if self._current_auto_numbered_footnote >= len(self._auto_numbered_footnotes):
            return None
        link = self._auto_numbered_footnotes[self._current_auto_numbered_footnote]
        self._current_auto_numbered_footnote += 1
        return link

    def _next_auto_character_footnote(self):
        if self._current_auto_character_footnote >= len(self._auto_character_footnotes):
            return None
        link = self._auto_character_footnotes[self._current_auto_character_footnote]
        self._current_auto_character_footnote += 1
        return link

    def _resolve_redirect(self, target: str):
        seen = set()
        while target not in seen:
            seen.add(target)
            link = self._targets.get(target)
            if link and link.type == "redirect":
                target = link.content
            elif link:
                return link
        return None


_FOOTNOTE_CHARS = "*†‡§¶#♠♥♦♣"
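# Footnote symbols are assigned like bijective base-10 numerals over the ten
# characters above: _char_footnote(1) == "*", _char_footnote(10) == "♣",
# _char_footnote(11) == "**", and so on.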


def _char_footnote(n: int, /) -> str:
    assert n > 0
    n_chars = len(_FOOTNOTE_CHARS)
    result = ""
    while n > 0:
        n -= 1
        result = _FOOTNOTE_CHARS[n % n_chars] + result
        n //= n_chars
    return result


@_t.final
class RstParser(yuio.doc.DocParser):
    """
    Parses a subset of ReStructuredText.

    """

    def parse(self, s: str, /) -> yuio.doc.Document:
        self._lines = s.expandtabs(tabsize=4).splitlines(keepends=False)
        self._headings: dict[tuple[str, bool], int] = {}
        self._links: list[_Hyperlink] = []
        self._anonymous_links: list[_Hyperlink] = []
        self._targets: dict[str, _Hyperlink] = {}
        self._last_numbered_footnote = 1
        self._last_character_footnote = 1
        self._auto_numbered_footnotes: list[str] = []
        self._auto_character_footnotes: list[str] = []

        root = yuio.doc.Document(items=[])
        self._process_block(root, 0, len(self._lines))
        link_resolver = _LinkResolver(
            self._targets,
            self._anonymous_links,
            self._auto_numbered_footnotes,
            self._auto_character_footnotes,
        )
        yuio.doc._clean_tree(root)
        self._process_inline_text(root, link_resolver)
        return root

    def parse_paragraph(self, s: str, /) -> list[str | yuio.doc.TextRegion]:
        return _InlineParser(s, _LinkResolver({}, [], [], [])).run()

    def _process_inline_text(
        self, node: yuio.doc.AstBase, link_resolver: _LinkResolver
    ):
        if isinstance(node, yuio.doc.Admonition):
            processor = _InlineParser("\n".join(map(str, node.title)), link_resolver)
            node.title = processor.run()
        if isinstance(node, yuio.doc.Text):
            processor = _InlineParser("\n".join(map(str, node.items)), link_resolver)
            node.items = processor.run()
        elif isinstance(node, yuio.doc.Container):
            for item in node.items:
                self._process_inline_text(item, link_resolver)

    def _process_block(self, parent: yuio.doc.Container[_t.Any], start: int, end: int):
        i = start
        prev_line_ending = _LineEnding.NORMAL

        while i < end:
            i, prev_line_ending = self._consume_block(parent, i, end, prev_line_ending)

        return parent

    def _consume_block(
        self,
        parent: yuio.doc.Container[_t.Any],
        start: int,
        end: int,
        prev_line_ending: _LineEnding,
    ) -> tuple[int, _LineEnding]:
        if start >= end:  # pragma: no cover
            return start, prev_line_ending

        line = self._lines[start]

        if _is_blank(line):
            return start + 1, prev_line_ending

        result = None

        if prev_line_ending == _LineEnding.LITERAL_MARK and (
            line.startswith(" ") or line.startswith(_PUNCT)
        ):
            result = self._try_process_literal_text(parent, start, end)
        elif _is_heading_underline(self._lines, start, end):
            self._process_title(parent, line, self._lines[start + 1][0], False)
            result = start + 2
        elif _is_heading_overline(self._lines, start, end):
            self._process_title(parent, self._lines[start + 1], line[0], True)
            result = start + 3
        elif line.startswith(">>>"):
            result = self._process_doctest_block(parent, start, end)
        elif line.startswith(" "):
            result = self._process_block_quote(parent, start, end)
        elif match := _LINE_BLOCK_START_RE.match(line):
            result = self._process_line_block(parent, start, end, match)
        elif match := _BULLET_LIST_START_RE.match(line):
            result = self._process_bullet_list(parent, start, end, match)
        elif match := _NUM_LIST_START_RE.match(line):
            result = self._try_process_numbered_list(parent, start, end, match)
        elif match := _EXPLICIT_MARKUP_START_RE.match(line):
            result = self._try_process_explicit_markup(parent, start, end, match)
        elif match := _IMPLICIT_HYPERLINK_TARGET_RE.match(line):
            result = self._process_implicit_hyperlink_target(parent, start, end, match)
        elif match := _FIELD_START_RE.match(line):
            result = self._process_field_list(parent, start, end, match)
        elif (
            start + 1 < end
            and self._lines[start + 1].startswith(" ")
            and not _is_blank(self._lines[start + 1])
        ):
            result = self._process_def_list(parent, start, end)

        if result is None:
            return self._process_paragraph(parent, start, end)
        else:
            return result, _LineEnding.NORMAL

    def _process_title(
        self,
        parent: yuio.doc.Container[_t.Any],
        title: str,
        marker: str,
        is_overline: bool,
    ):
        if level := self._headings.get((marker, is_overline)):
            parent.items.append(yuio.doc.Heading(items=[title.strip()], level=level))
        else:
            level = len(self._headings) + 1
            self._headings[(marker, is_overline)] = level
            parent.items.append(yuio.doc.Heading(items=[title.strip()], level=level))

    def _try_process_literal_text(
        self, parent: yuio.doc.Container[_t.Any], start: int, end: int
    ) -> int | None:
        ch = self._lines[start][0]

        if ch.isspace():
            end = self._gather_indented_lines(start, end, True)
        elif ch in _PUNCT:
            end = self._gather_prefixed_lines(start, end, ch)
        else:  # pragma: no cover
            return None

        node = yuio.doc.Code(lines=[], syntax="text")
        for i in range(start, end):
            node.lines.append(self._lines[i])
        parent.items.append(node)

        return end

    def _process_line_block(
        self,
        parent: yuio.doc.Container[_t.Any],
        start: int,
        end: int,
        match: _tx.StrReMatch | None,
    ) -> int | None:
        block_end = start + 1
        lines = []
        while match:
            self._lines[start] = match["tail"]
            block_end = self._gather_indented_lines(start + 1, end, False)
            lines.append(" ".join(self._lines[start:block_end]))

            start = block_end
            if start >= end:
                match = None
            else:
                match = _LINE_BLOCK_START_RE.match(self._lines[start])

        node = yuio.doc.Paragraph(items=["\v".join(lines)])
        parent.items.append(node)
        return block_end

    def _process_bullet_list(
        self,
        parent: yuio.doc.Container[_t.Any],
        start: int,
        end: int,
        match: _tx.StrReMatch,
    ) -> int:
        if (
            parent.items
            and isinstance(parent.items[-1], yuio.doc.List)
            and parent.items[-1].items
        ):
            list_node = parent.items[-1]
            prev_enumerator_kind = list_node.enumerator_kind
            prev_marker_kind = list_node.marker_kind
            prev_num = list_node.items[-1].number
        else:
            list_node = None
            prev_enumerator_kind = None
            prev_marker_kind = None
            prev_num = None

        enumerator_kind = match["enumerator"]
        marker_kind = None
        num = None

        if (
            enumerator_kind != prev_enumerator_kind
            or marker_kind != prev_marker_kind
            or (prev_num is not None)
        ):
            list_node = None

        if list_node is None:
            list_node = yuio.doc.List(
                items=[], enumerator_kind=enumerator_kind, marker_kind=marker_kind
            )
            parent.items.append(list_node)

        self._lines[start] = match["tail"]
        if not match["space"]:
            end = self._gather_indented_lines(start + 1, end, True)
        else:
            indent = len(match["indent"])
            end = self._gather_exactly_indented_lines(start + 1, end, indent, True)

        node = yuio.doc.ListItem(items=[], number=num)
        self._process_block(node, start, end)
        list_node.items.append(node)
        return end

    def _try_process_numbered_list(
        self,
        parent: yuio.doc.Container[_t.Any],
        start: int,
        end: int,
        match: _tx.StrReMatch,
    ) -> int | None:
        if (
            parent.items
            and isinstance(parent.items[-1], yuio.doc.List)
            and parent.items[-1].items
        ):
            list_node = parent.items[-1]
            prev_enumerator_kind = list_node.enumerator_kind
            prev_marker_kind = list_node.marker_kind
            prev_num = list_node.items[-1].number
        else:
            list_node = None
            prev_enumerator_kind = None
            prev_marker_kind = None
            prev_num = None

        list_data = _detect_num_list_type(
            match,
            prev_enumerator_kind,
            prev_marker_kind,
            prev_num,
        )

        if list_data is None:
            return None  # TODO: this is not covered, I don't know why

        enumerator_kind, marker_kind, num = list_data

        # Verify next line (if exists) is compatible
        if start + 1 < end:
            next_line = self._lines[start + 1]
            if not (
                not next_line
                or next_line.startswith(" ")
                or _is_list_start(next_line, enumerator_kind, marker_kind, num)
            ):
                return None

        if (
            enumerator_kind != prev_enumerator_kind
            or marker_kind != prev_marker_kind
            or (prev_num is None or num != prev_num + 1)
        ):
            list_node = None

        if list_node is None:
            list_node = yuio.doc.List(
                items=[], enumerator_kind=enumerator_kind, marker_kind=marker_kind
            )
            parent.items.append(list_node)

        self._lines[start] = match["tail"]
        if not match["space"]:
            end = self._gather_indented_lines(start + 1, end, True)
        else:
            indent = len(match["indent"])
            end = self._gather_exactly_indented_lines(start + 1, end, indent, True)

        node = yuio.doc.ListItem(items=[], number=num)
        self._process_block(node, start, end)
        list_node.items.append(node)
        return end

    def _process_doctest_block(
        self, parent: yuio.doc.Container[_t.Any], start: int, end: int
    ) -> int | None:
        node = yuio.doc.Code(lines=[], syntax="python")

        block_end = 0
        for i in range(start, end):
            line = self._lines[i]
            if _is_blank(line):
                break
            node.lines.append(line)
            block_end = i + 1

        parent.items.append(node)
        return block_end

    def _try_process_explicit_markup(
        self,
        parent: yuio.doc.Container[_t.Any],
        start: int,
        end: int,
        match: _tx.StrReMatch,
    ) -> int | None:
        """Try to process explicit markup (directives, comments, etc.)."""
        content = match["tail"].strip()

        if not content:
            start += 1
            if start < end and not _is_blank(self._lines[start]):
                return self._gather_indented_lines(start + 1, end, True)
            else:
                return start

        if content.startswith("["):
            return self._parse_footnote(parent, start, end, content)

        if content.startswith("|"):
            # TODO: save substitution
            return self._gather_indented_lines(start + 1, end, False)

        if content.startswith("_"):
            return self._parse_hyperlink_target(start, end, content)

        # Directive
        if "::" in content:
            return self._parse_directive(parent, start, end, content)

        # Default to comment
        return self._gather_indented_lines(start + 1, end, True)

    def _parse_hyperlink_target(self, start: int, end: int, content: str):
        end = self._gather_indented_lines(start + 1, end, False)
        content += "\n".join(self._lines[start + 1 : end])
        anchor, _, target = content[1:].partition(":")
        anchor = _normalize_hyperlink_anchor(anchor)
        target, is_redirect = _normalize_hyperlink_target(target)
        self._add_link(
            _Hyperlink(
                start,
                end,
                anchor,
                "redirect" if is_redirect else "link",
                target,
            )
        )
        return end

    def _parse_footnote(
        self, parent: yuio.doc.Container[_t.Any], start: int, end: int, content: str
    ):
        end = self._gather_indented_lines(start + 1, end, True)
        name, _, content = content[1:].partition("]")
        self._lines[start] = content.strip()

        if name.startswith("#"):
            name = name[1:]
            while True:
                auto_name = str(self._last_numbered_footnote)
                self._last_numbered_footnote += 1
                if auto_name not in self._targets:
                    break
            if not name:
                self._auto_numbered_footnotes.append(auto_name)
        elif name.startswith("*"):
            name = name[1:]
            while True:
                auto_name = _char_footnote(self._last_character_footnote)
                self._last_character_footnote += 1
                if auto_name not in self._targets:
                    break
            if not name:
                self._auto_character_footnotes.append(auto_name)
        else:
            auto_name = name

        link = _Hyperlink(start, end, auto_name, "footnote", auto_name)
        self._add_link(link)
        if name and name not in self._targets:
            self._targets[name] = link

        if parent.items and isinstance(parent.items[-1], yuio.doc.FootnoteContainer):
            container = parent.items[-1]
        else:
            container = yuio.doc.FootnoteContainer(items=[])
            parent.items.append(container)

        node = yuio.doc.Footnote(
            items=[],
            marker=auto_name,
        )
        self._process_block(node, start, end)
        container.items.append(node)

        return end
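
    # When several hyperlink targets are stacked directly on top of each other,
    # targets without content of their own inherit the content of the link
    # that follows them.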
    def _add_link(self, link: _Hyperlink):
        if link.content:
            start = link.start
            for prev_link in reversed(self._links):
                if prev_link.content:
                    break
                if not (
                    prev_link.end == start
                    or all(
                        _is_blank(line) for line in self._lines[prev_link.end : start]
                    )
                ):
                    break
                prev_link.type = link.type
                prev_link.content = link.content
                start = prev_link.start
        self._links.append(link)
        if link.name == "_":
            self._anonymous_links.append(link)
        elif link.name not in self._targets:
            self._targets[link.name] = link

    def _parse_directive(
        self, parent: yuio.doc.Container[_t.Any], start: int, end: int, content: str
    ) -> int:
        name, _, arg = content.partition("::")
        name = name.strip()
        arg = arg.strip()

        end = self._gather_indented_lines(start + 1, end, True)

        i = start + 1

        # Parse arguments and options.
        while i < end:
            arg_line = self._lines[i]
            i += 1
            if _is_blank(arg_line):
                break

        parent.items.extend(
            yuio.doc._process_directive(
                name,
                arg,
                lambda: self._lines[i:end],
                lambda: self._process_block(yuio.doc.Document(items=[]), i, end).items,
            )
        )

        return end

    def _process_block_quote(
        self, parent: yuio.doc.Container[_t.Any], start: int, end: int
    ) -> int:
        end = self._gather_indented_lines(start, end, True)
        node = yuio.doc.Quote(items=[])
        self._process_block(node, start, end)
        parent.items.append(node)
        return end

    def _process_implicit_hyperlink_target(
        self,
        parent: yuio.doc.Container[_t.Any],
        start: int,
        end: int,
        match: _tx.StrReMatch,
    ) -> int:
        return self._parse_hyperlink_target(start, end, f"__: {match.group('tail')}")

    def _process_field_list(
        self,
        parent: yuio.doc.Container[_t.Any],
        start: int,
        end: int,
        match: _tx.StrReMatch,
    ) -> int:
        self._lines[start] = match["tail"]
        end = self._gather_indented_lines(start + 1, end, True)
        node = yuio.doc.Admonition(
            items=[],
            title=[match["content"].strip() + "\\ :"],
            type="field",
        )
        self._process_block(node, start, end)
        parent.items.append(node)
        return end

    def _process_def_list(
        self, parent: yuio.doc.Container[_t.Any], start: int, end: int
    ) -> int:
        end = self._gather_indented_lines(start + 1, end, True)
        node = yuio.doc.Admonition(
            items=[],
            title=[self._lines[start].strip()],
            type="definition",
        )
        self._process_block(node, start + 1, end)
        parent.items.append(node)
        return end
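
    # A paragraph that consists of a lone "::" switches the parser into
    # literal-block mode without emitting anything; a paragraph ending in
    # "text::" does the same, but keeps a single trailing colon on the text.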
    def _process_paragraph(
        self, parent: yuio.doc.Container[_t.Any], start: int, end: int
    ) -> tuple[int, _LineEnding]:
        end = self._gather_exactly_indented_lines(start, end, 0, False)
        if end == start + 1 and self._lines[start].strip() == "::":
            return end, _LineEnding.LITERAL_MARK
        elif end == start + 1 and _is_transition(self._lines[start]):
            parent.items.append(yuio.doc.ThematicBreak())
            return end, _LineEnding.NORMAL
        elif end > start and self._lines[end - 1].rstrip().endswith("::"):
            line_ending = _LineEnding.LITERAL_MARK
            self._lines[end - 1] = self._lines[end - 1].rstrip()[:-1]
        else:
            line_ending = _LineEnding.NORMAL
        node = yuio.doc.Paragraph(
            items=_t.cast(list[str | yuio.doc.TextRegion], self._lines[start:end])
        )
        parent.items.append(node)
        return end, line_ending

    def _gather_indented_lines(self, start: int, end: int, allow_blank: bool) -> int:
        if start >= end:
            return start

        common_indent = None
        result_end = start

        for i in range(start, end):
            line = self._lines[i]
            if _is_blank(line):
                if allow_blank:
                    continue
                else:
                    break

            indent = len(line) - len(line.lstrip())
            if indent >= 1:
                result_end = i + 1
                if common_indent is None:
                    common_indent = indent
                else:
                    common_indent = min(common_indent, indent)
            else:
                break

        if common_indent:
            for i in range(start, result_end):
                self._lines[i] = self._lines[i][common_indent:]

        return result_end

    def _gather_exactly_indented_lines(
        self, start: int, end: int, min_indent: int, allow_blank: bool
    ) -> int:
        result_end = start

        for i in range(start, end):
            line = self._lines[i]
            if _is_blank(line):
                if allow_blank:
                    continue
                else:
                    break

            if not min_indent:
                result_end = i + 1
            elif len(line) - len(line.lstrip()) >= min_indent:
                result_end = i + 1
                self._lines[i] = self._lines[i][min_indent:]
            else:
                break

        return result_end

    def _gather_prefixed_lines(self, start: int, end: int, prefix: str) -> int:
        result_end = start

        for i in range(start, end):
            if self._lines[i] and self._lines[i][0] == prefix:
                result_end = i + 1
            else:
                break

        return result_end


def _is_blank(line: str) -> bool:
    return not line or line.isspace()


def _is_transition(line: str) -> bool:
    return len(line) >= 4 and line[0] in _PUNCT and all(c == line[0] for c in line)


def _is_heading_underline(lines, start, end):
    if end - start < 2:
        return False
    title, underline = lines[start : start + 2]
    return (
        title
        and not title.startswith(" ")
        and underline
        and underline[0] in _PUNCT
        and all(c == underline[0] for c in underline)
        and len(title) <= len(underline)
    )


def _is_heading_overline(lines, start, end):
    if end - start < 3:
        return False
    overline, title, underline = lines[start : start + 3]
    return (
        overline
        and title
        and underline
        and overline[0] in _PUNCT
        and overline[0] == underline[0]
        and all(c == overline[0] for c in overline)
        and len(title) <= len(overline)
        and all(c == underline[0] for c in underline)
        and len(title) <= len(underline)
    )


# fmt: off
# The following code is copied from docutils/utils/punctuation_chars.py
# Copyright 2011, 2017 Günter Milde, 2-Clause BSD license.
# See https://sourceforge.net/p/docutils/code/HEAD/tree/trunk/docutils/docutils/utils/punctuation_chars.py.
# See https://opensource.org/license/BSD-2-Clause.
_OPENERS = (
    "\"'(<\\[{\u0f3a\u0f3c\u169b\u2045\u207d\u208d\u2329\u2768"
    "\u276a\u276c\u276e\u2770\u2772\u2774\u27c5\u27e6\u27e8\u27ea"
    "\u27ec\u27ee\u2983\u2985\u2987\u2989\u298b\u298d\u298f\u2991"
    "\u2993\u2995\u2997\u29d8\u29da\u29fc\u2e22\u2e24\u2e26\u2e28"
    "\u3008\u300a\u300c\u300e\u3010\u3014\u3016\u3018\u301a\u301d"
    "\u301d\ufd3e\ufe17\ufe35\ufe37\ufe39\ufe3b\ufe3d\ufe3f\ufe41"
    "\ufe43\ufe47\ufe59\ufe5b\ufe5d\uff08\uff3b\uff5b\uff5f\uff62"
    "\xab\u2018\u201c\u2039\u2e02\u2e04\u2e09\u2e0c\u2e1c\u2e20"
    "\u201a\u201e\xbb\u2019\u201d\u203a\u2e03\u2e05\u2e0a\u2e0d"
    "\u2e1d\u2e21\u201b\u201f"
)
_CLOSERS = (
    "\"')>\\]}\u0f3b\u0f3d\u169c\u2046\u207e\u208e\u232a\u2769"
    "\u276b\u276d\u276f\u2771\u2773\u2775\u27c6\u27e7\u27e9\u27eb"
    "\u27ed\u27ef\u2984\u2986\u2988\u298a\u298c\u298e\u2990\u2992"
    "\u2994\u2996\u2998\u29d9\u29db\u29fd\u2e23\u2e25\u2e27\u2e29"
    "\u3009\u300b\u300d\u300f\u3011\u3015\u3017\u3019\u301b\u301e"
    "\u301f\ufd3f\ufe18\ufe36\ufe38\ufe3a\ufe3c\ufe3e\ufe40\ufe42"
    "\ufe44\ufe48\ufe5a\ufe5c\ufe5e\uff09\uff3d\uff5d\uff60\uff63"
    "\xbb\u2019\u201d\u203a\u2e03\u2e05\u2e0a\u2e0d\u2e1d\u2e21"
    "\u201b\u201f\xab\u2018\u201c\u2039\u2e02\u2e04\u2e09\u2e0c"
    "\u2e1c\u2e20\u201a\u201e"
)
_DELIMITERS = (
    "\\-/:\u058a\xa1\xb7\xbf\u037e\u0387\u055a-\u055f\u0589"
    "\u05be\u05c0\u05c3\u05c6\u05f3\u05f4\u0609\u060a\u060c"
    "\u060d\u061b\u061e\u061f\u066a-\u066d\u06d4\u0700-\u070d"
    "\u07f7-\u07f9\u0830-\u083e\u0964\u0965\u0970\u0df4\u0e4f"
    "\u0e5a\u0e5b\u0f04-\u0f12\u0f85\u0fd0-\u0fd4\u104a-\u104f"
    "\u10fb\u1361-\u1368\u1400\u166d\u166e\u16eb-\u16ed\u1735"
    "\u1736\u17d4-\u17d6\u17d8-\u17da\u1800-\u180a\u1944\u1945"
    "\u19de\u19df\u1a1e\u1a1f\u1aa0-\u1aa6\u1aa8-\u1aad\u1b5a-"
    "\u1b60\u1c3b-\u1c3f\u1c7e\u1c7f\u1cd3\u2010-\u2017\u2020-"
    "\u2027\u2030-\u2038\u203b-\u203e\u2041-\u2043\u2047-"
    "\u2051\u2053\u2055-\u205e\u2cf9-\u2cfc\u2cfe\u2cff\u2e00"
    "\u2e01\u2e06-\u2e08\u2e0b\u2e0e-\u2e1b\u2e1e\u2e1f\u2e2a-"
    "\u2e2e\u2e30\u2e31\u3001-\u3003\u301c\u3030\u303d\u30a0"
    "\u30fb\ua4fe\ua4ff\ua60d-\ua60f\ua673\ua67e\ua6f2-\ua6f7"
    "\ua874-\ua877\ua8ce\ua8cf\ua8f8-\ua8fa\ua92e\ua92f\ua95f"
    "\ua9c1-\ua9cd\ua9de\ua9df\uaa5c-\uaa5f\uaade\uaadf\uabeb"
    "\ufe10-\ufe16\ufe19\ufe30-\ufe32\ufe45\ufe46\ufe49-\ufe4c"
    "\ufe50-\ufe52\ufe54-\ufe58\ufe5f-\ufe61\ufe63\ufe68\ufe6a"
    "\ufe6b\uff01-\uff03\uff05-\uff07\uff0a\uff0c-\uff0f\uff1a"
    "\uff1b\uff1f\uff20\uff3c\uff61\uff64\uff65"
    "\U00010100\U00010101\U0001039f\U000103d0\U00010857"
    "\U0001091f\U0001093f\U00010a50-\U00010a58\U00010a7f"
    "\U00010b39-\U00010b3f\U000110bb\U000110bc\U000110be-"
    "\U000110c1\U00012470-\U00012473"
)
_CLOSING_DELIMITERS = r"\\.,;!?"
_QUOTE_PAIRS = {
    # open char: matching closing characters  # use case
    "\xbb": "\xbb",  # » » Swedish
    "\u2018": "\u201a",  # ‘ ‚ Albanian/Greek/Turkish
    "\u2019": "\u2019",  # ’ ’ Swedish
    "\u201a": "\u2018\u2019",  # ‚ ‘ German, ‚ ’ Polish
    "\u201c": "\u201e",  # “ „ Albanian/Greek/Turkish
    "\u201e": "\u201c\u201d",  # „ “ German, „ ” Polish
    "\u201d": "\u201d",  # ” ” Swedish
    "\u203a": "\u203a",  # › › Swedish
}
def _match_chars(c1, c2):
    try:
        i = _OPENERS.index(c1)
    except ValueError:  # c1 not in openers
        return False
    return c2 == _CLOSERS[i] or c2 in _QUOTE_PAIRS.get(c1, "")
# End docutils code.
# fmt: on

_OPENERS_RE = re.compile(rf"[{_OPENERS}{_DELIMITERS}]")
_CLOSERS_RE = re.compile(rf"[{_CLOSERS}{_DELIMITERS}{_CLOSING_DELIMITERS}]")
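

# Inline markup recognition rules from the RST spec: markup may only start
# after whitespace or an opening/delimiting character, may only end before
# whitespace or a closing/delimiting character, and the characters directly
# around it must not form a matching quote pair.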
def _is_start_string(prev: str, next: str) -> bool:
    if next.isspace():
        return False
    if prev.isspace():
        return True
    if _match_chars(prev, next):
        return False
    # if character_level_inline_markup:
    #     return True
    return _OPENERS_RE.match(prev) is not None


def _is_end_string(prev: str, next: str) -> bool:
    if prev.isspace():
        return False
    if next.isspace():
        return True
    if _match_chars(prev, next):
        return False
    # if character_level_inline_markup:
    #     return True
    return _CLOSERS_RE.match(next) is not None
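

# Examples of enumerator detection: "1." -> (NUMBER, DOT, 1),
# "(a)" -> (SMALL_LETTER, ENCLOSED, 1), "iv)" -> (SMALL_ROMAN, PAREN, 4);
# the auto-enumerator "#" continues whatever list precedes it.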
def _detect_num_list_type(
    match: _tx.StrReMatch,
    prev_enumerator_kind: yuio.doc.ListEnumeratorKind | str | None,
    prev_marker_kind: yuio.doc.ListMarkerKind | None,
    prev_num: int | None,
) -> tuple[yuio.doc.ListEnumeratorKind, yuio.doc.ListMarkerKind, int] | None:
    match (match["open_marker"], match["close_marker"]):
        case ("(", ")"):
            marker_kind = yuio.doc.ListMarkerKind.ENCLOSED
        case ("", ")"):
            marker_kind = yuio.doc.ListMarkerKind.PAREN
        case ("", "."):
            marker_kind = yuio.doc.ListMarkerKind.DOT
        case _:
            return None

    if (
        prev_enumerator_kind is not None
        and prev_marker_kind is not None
        and prev_num is not None
        and marker_kind == prev_marker_kind
        and isinstance(prev_enumerator_kind, yuio.doc.ListEnumeratorKind)
    ):
        # List continues.
        if match["enumerator"] == "#":
            return prev_enumerator_kind, prev_marker_kind, prev_num + 1
        match prev_enumerator_kind:
            case yuio.doc.ListEnumeratorKind.NUMBER:
                expected_enumerator = str(prev_num + 1)
            case yuio.doc.ListEnumeratorKind.SMALL_LETTER:
                expected_enumerator = yuio.doc.to_letters(prev_num + 1)
            case yuio.doc.ListEnumeratorKind.CAPITAL_LETTER:
                expected_enumerator = yuio.doc.to_letters(prev_num + 1).upper()
            case yuio.doc.ListEnumeratorKind.SMALL_ROMAN:
                expected_enumerator = yuio.doc.to_roman(prev_num + 1)
            case yuio.doc.ListEnumeratorKind.CAPITAL_ROMAN:
                expected_enumerator = yuio.doc.to_roman(prev_num + 1).upper()
        if match["enumerator"].lstrip("0") == expected_enumerator:
            return prev_enumerator_kind, prev_marker_kind, prev_num + 1

    # List starts afresh.
    if enumerator := match["enumerator_num"]:
        return yuio.doc.ListEnumeratorKind.NUMBER, marker_kind, int(enumerator)
    elif enumerator := match["enumerator_auto"]:
        return yuio.doc.ListEnumeratorKind.NUMBER, marker_kind, 1
    elif enumerator := match["enumerator_lowercase"]:
        if (enumerator == "i" or len(enumerator) > 1) and (
            (num := yuio.doc.from_roman(enumerator)) is not None
        ):
            return yuio.doc.ListEnumeratorKind.SMALL_ROMAN, marker_kind, num
        elif len(enumerator) > 1:
            return None
        elif (num := yuio.doc.from_letters(enumerator)) is not None:
            return yuio.doc.ListEnumeratorKind.SMALL_LETTER, marker_kind, num
        else:
            return None
    elif enumerator := match["enumerator_uppercase"]:
        if (enumerator == "I" or len(enumerator) > 1) and (
            num := yuio.doc.from_roman(enumerator)
        ) is not None:
            return yuio.doc.ListEnumeratorKind.CAPITAL_ROMAN, marker_kind, num
        elif len(enumerator) > 1:
            return None
        elif (num := yuio.doc.from_letters(enumerator)) is not None:
            return yuio.doc.ListEnumeratorKind.CAPITAL_LETTER, marker_kind, num
        else:
            return None

    return None


def _is_list_start(
    line: str,
    prev_enumerator_kind: yuio.doc.ListEnumeratorKind | str,
    prev_marker_kind: yuio.doc.ListMarkerKind,
    prev_num: int,
):
    match = _NUM_LIST_START_RE.match(line)
    if not match:
        return False
    list_data = _detect_num_list_type(
        match, prev_enumerator_kind, prev_marker_kind, prev_num
    )
    if not list_data:
        return False
    enumerator_kind, marker_kind, num = list_data
    return (
        enumerator_kind == prev_enumerator_kind
        and marker_kind == prev_marker_kind
        and num == prev_num + 1
    )


def _normalize_hyperlink_anchor(anchor: str) -> str:
    return _unescape(re.sub(r"\s+", " ", anchor.strip()).casefold())
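

# For example, "Other Target_" normalizes to ("OtherTarget", True), a redirect
# to another anchor, while "https://example.com/" normalizes to
# ("https://example.com/", False), a plain URL.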
def _normalize_hyperlink_target(target: str) -> tuple[str, bool]:
    is_redirect = bool(re.match(r"^(\\.|[^\\])*_$", target))
    target = re.sub(r"\\(.)|\s", r"\1", target)
    if is_redirect:
        target = target[:-1]
    return target, is_redirect


def _unescape(text: str) -> str:
    return re.sub(r"\\(?:\s|(.))", r"\1", text)


@dataclass(slots=True)
class _Token:
    """
    Token for processing inline markup.

    """

    start: int
    end: int
    kind: str

    _data: dict[str, _t.Any] | None = dataclasses.field(init=False, default=None)

    @property
    def data(self):
        if self._data is None:
            self._data = {}
        return self._data


class _InlineParser:
    def __init__(self, text: str, link_resolver: _LinkResolver) -> None:
        self._text: str = text
        self._start: int = 0
        self._pos: int = 0
        self._tokens: list[_Token] = []
        self._link_resolver = link_resolver

    def run(self) -> list[str | yuio.doc.TextRegion]:
        while self._fits(self._pos):
            self._run()
        if self._start < len(self._text):
            self._tokens.append(_Token(self._start, len(self._text), "text"))

        res: list[str | yuio.doc.TextRegion] = []
        for token in self._tokens:
            text = _unescape(self._text[token.start : token.end])
            match token.kind:
                case "text":
                    res.append(text)
                case "em":
                    res.append(yuio.doc.HighlightedRegion(text, color="em"))
                case "strong":
                    res.append(yuio.doc.HighlightedRegion(text, color="strong"))
                case "formatted":
                    res.append(token.data["content"])
                case "link":
                    if title := token.data.get("title"):
                        text = _unescape(title)
                    res.append(yuio.doc.LinkRegion(text, url=token.data.get("url", "")))
                case "footnote":
                    if content := token.data.get("content"):
                        text = _unescape(content)
                    text = f"[{text}]"
                    res.append(
                        yuio.doc.NoWrapRegion(
                            yuio.doc.HighlightedRegion(text, color="role/footnote")
                        )
                    )
                case kind:
                    assert False, kind
        return res

    def _fits(self, i):
        return i < len(self._text)

    def _ch_eq(self, i, cs):
        return self._fits(i) and self._text[i] in cs

    def _ch_in(self, i, cs):
        return self._fits(i) and self._text[i] in cs

    def _ch_at(self, i):
        if 0 <= i < len(self._text):
            return self._text[i]
        else:
            return " "

    def _eat(self, ch):
        start = self._pos
        while self._pos < len(self._text) and self._text[self._pos] == ch:
            self._pos += 1
        return self._pos - start

    def _eat_in(self, ch):
        while self._pos < len(self._text) and self._text[self._pos] in ch:
            self._pos += 1

    def _eat_not_in(self, ch):
        while self._pos < len(self._text) and self._text[self._pos] not in ch:
            self._pos += 1

    def _emit(
        self,
        tok_start: int,
        content_start: int,
        content_end: int,
        token_end: int,
        kind: str,
    ):
        if tok_start > self._start:
            self._tokens.append(_Token(self._start, tok_start, "text"))
        assert token_end == self._pos  # sanity check
        self._start = self._pos
        token = _Token(content_start, content_end, kind)
        self._tokens.append(token)
        return token

    def _run(self):
        match self._text[self._pos]:
            case "\\":
                self._pos += 2
            case "`":
                if self._ch_eq(self._pos + 1, "`"):
                    self._parse_inline_literal()
                else:
                    self._parse_interpreted_text(
                        prefix_role=None, prefix_role_start=None
                    )
            case ":":
                self._parse_prefixed_interpreted_text()
            case "*":
                if self._ch_eq(self._pos + 1, "*"):
                    self._parse_strong()
                else:
                    self._parse_emphasis()
            case "|":
                self._parse_substitution()
            case "_":
                if self._ch_eq(self._pos + 1, "`"):
                    self._parse_inline_internal_target()
                else:
                    self._parse_unquoted_link()
            case "[":
                self._parse_footnote_reference()
            case _:
                self._eat_not_in("\\`:*|_[")

    def _scan_for_explicit_role(self) -> str | None:
        """
        Eat explicit role, leaving current position right after it. If explicit role
        can't be found, returns None and leaves current position untouched::

            text :role:`ref`
                 │     └ position if this function succeeds
                 └ initial position

            text :malformed-role
                 │
                 └ initial position, position if this function fails

        """

        if not self._ch_eq(self._pos, ":"):  # pragma: no cover
            return None

        token_start = self._pos
        self._pos += 1
        content_start = self._pos

        while self._fits(self._pos):
            match self._text[self._pos]:
                case ch if ch.isalnum():
                    self._pos += 1
                case ":":
                    if self._ch_at(self._pos + 1).isalnum():
                        # Isolated punctuation.
                        self._pos += 1
                        continue

                    content_end = self._pos
                    self._pos += 1

                    if content_start == content_end:
                        # Empty content is not allowed.
                        break

                    return self._text[content_start:content_end]
                case ch if ch in "-_+:," and not self._ch_in(self._pos + 1, "-_+:,"):
                    # Isolated punctuation.
                    self._pos += 1
                case _:
                    break

        self._pos = token_start  # Leave position as it was before.
        return None

    def _parse_inline_literal(self):
        """
        Eats and emits inline literal. If inline literal can't be parsed, advances
        current position one char and returns::

            text ``literal``
                 │          └ position if this function succeeds
                 └ initial position

            text ``literal
                 │└ position if this function fails
                 └ initial position

        """

        assert self._ch_eq(self._pos, "`")
        assert self._ch_eq(self._pos + 1, "`")

        token_start = self._pos
        self._pos += 2
        content_start = self._pos

        prev_char = self._ch_at(token_start - 1)
        next_char = self._ch_at(content_start)
        if not _is_start_string(prev_char, next_char):
            self._pos = content_start + 1
            return

        while self._fits(self._pos):
            match self._text[self._pos]:
                case "`" if self._ch_eq(self._pos + 1, "`"):
                    content_end = self._pos
                    self._pos += 2
                    token_end = self._pos

                    prev_char = self._ch_at(content_end - 1)
                    next_char = self._ch_at(token_end)
                    if not _is_end_string(prev_char, next_char):
                        self._pos = content_end + 1  # Skip 1 char and continue.
                        continue

                    if content_start == content_end:
                        # Empty content is not allowed.
                        break

                    token = self._emit(
                        token_start, content_start, content_end, token_end, "formatted"
                    )
                    token.data["content"] = yuio.doc._process_role(
                        self._text[content_start:content_end], "code"
                    )
                    return
                case _:
                    self._pos += 1

        self._pos = content_start + 1

    def _parse_interpreted_text(
        self, prefix_role: str | None, prefix_role_start: int | None
    ):
        """
        Eats and emits interpreted text and its tail role or hyperlink marker.
        If interpreted text can't be parsed, advances current position one char
        and returns::

            text `ref`
                 │    └ position if this function succeeds
                 └ initial position

            text `ref
                 │└ position if this function fails
                 └ initial position

            text :role:`ref`
                 │     │    └ position if this function succeeds
                 │     └ initial position
                 └ prefix_role_start

            text :role:`ref
                 ││    └ initial position
                 │└ position if this function fails
                 └ prefix_role_start

        """

        assert self._ch_eq(self._pos, "`")

        if prefix_role_start is None:
            prefix_role_start = self._pos

        token_start = prefix_role_start
        self._pos += 1
        content_start = self._pos

        # TODO: are these correct bounds?
        prev_char = self._ch_at(token_start - 1)
        next_char = self._ch_at(token_start + 1)
        if not _is_start_string(prev_char, next_char):
            self._pos = content_start + 1
            return

        while self._fits(self._pos):
            if self._ch_eq(self._pos, "`"):
                content_end = self._pos
                self._pos += 1
                if self._ch_eq(self._pos, "_"):
                    n_underscores = self._eat("_")
                    suffix_role = None
                elif self._ch_eq(self._pos, ":"):
                    suffix_role = self._scan_for_explicit_role()
                    n_underscores = 0
                else:
                    suffix_role = None
                    n_underscores = 0
                token_end = self._pos

                # TODO: are these correct bounds?
                prev_char = self._ch_at(content_end - 1)
                next_char = self._ch_at(token_end)
                if not _is_end_string(prev_char, next_char):
                    self._pos = content_end + 1
                    continue

                if content_start == content_end:
                    # Empty content is not allowed.
                    break

                if n_underscores > 2:
                    # Too many underscores.
                    break

                if bool(n_underscores) + bool(prefix_role) + bool(suffix_role) > 1:
                    # Malformed interpreted text, just skip it as-is.
                    return

                if n_underscores:
                    target, title = yuio.doc._process_link(
                        self._text[content_start:content_end],
                    )
                    link = self._link_resolver.find_link(
                        title, target, is_anonymous=n_underscores == 2
                    )
                    if link and link.type == "link":
                        target = link.content
                    else:
                        target = None
                    token = self._emit(
                        token_start, content_start, content_end, token_end, "link"
                    )
                    token.data["url"] = target
                    token.data["title"] = title
                else:
                    token = self._emit(
                        token_start, content_start, content_end, token_end, "formatted"
                    )
                    token.data["content"] = yuio.doc._process_role(
                        self._text[content_start:content_end],
                        prefix_role or suffix_role or "literal",
                    )
                return
            elif self._ch_eq(self._pos, "\\"):
                self._pos += 2
            else:
                self._pos += 1

        self._pos = content_start + 1

    def _parse_prefixed_interpreted_text(self):
        assert self._ch_eq(self._pos, ":")

        token_start = self._pos
        role = self._scan_for_explicit_role()
        if role and self._ch_eq(self._pos, "`"):
            self._parse_interpreted_text(role, token_start)
        else:
            self._pos = token_start + 1

    def _parse_emphasis(self):
        assert self._ch_eq(self._pos, "*")

        token_start = self._pos
        self._pos += 1
        content_start = self._pos

        prev_char = self._ch_at(token_start - 1)
        next_char = self._ch_at(content_start)
        if not _is_start_string(prev_char, next_char):
            self._pos = content_start + 1
            return

        while self._fits(self._pos):
            if self._ch_eq(self._pos, "*"):
                content_end = self._pos
                self._pos += 1
                token_end = self._pos

                prev_char = self._ch_at(content_end - 1)
                next_char = self._ch_at(token_end)
                if not _is_end_string(prev_char, next_char):
                    self._pos = content_end + 1
                    continue

                if content_start == content_end:
                    # Empty content is not allowed.
                    break

                self._emit(token_start, content_start, content_end, token_end, "em")
                return
            elif self._ch_eq(self._pos, "\\"):
                self._pos += 2
            else:
                self._pos += 1

        self._pos = content_start + 1

    def _parse_strong(self):
        assert self._ch_eq(self._pos, "*")
        assert self._ch_eq(self._pos + 1, "*")

        token_start = self._pos
        self._pos += 2
        content_start = self._pos

        prev_char = self._ch_at(token_start - 1)
        next_char = self._ch_at(content_start)
        if not _is_start_string(prev_char, next_char):
            self._pos = content_start + 1
            return

        while self._fits(self._pos):
            if self._ch_eq(self._pos, "*") and self._ch_eq(self._pos + 1, "*"):
                content_end = self._pos
                self._pos += 2
                token_end = self._pos

                prev_char = self._ch_at(content_end - 1)
                next_char = self._ch_at(token_end)
                if not _is_end_string(prev_char, next_char):
                    self._pos = content_end + 1
                    continue

                if content_start == content_end:
                    # Empty content is not allowed.
                    break

                self._emit(token_start, content_start, content_end, token_end, "strong")
                return
            elif self._ch_eq(self._pos, "\\"):
                self._pos += 2
            else:
                self._pos += 1

        self._pos = content_start + 1

    def _parse_substitution(self):
        assert self._ch_eq(self._pos, "|")

        token_start = self._pos
        self._pos += 1
        content_start = self._pos

        prev_char = self._ch_at(token_start - 1)
        next_char = self._ch_at(content_start)
        if not _is_start_string(prev_char, next_char):
            self._pos = content_start + 1
            return

        while self._fits(self._pos):
            if self._ch_eq(self._pos, "|"):
                content_end = self._pos
                self._pos += 1
                token_end = self._pos

                prev_char = self._ch_at(content_end - 1)
                next_char = self._ch_at(token_end)
                if not _is_end_string(prev_char, next_char):
                    self._pos = content_end + 1
                    continue

                if content_start == content_end:
                    # Empty content is not allowed.
                    break

                # TODO: actually substitute things.
                self._emit(token_start, content_start, content_end, token_end, "text")
                return
            elif self._ch_eq(self._pos, "\\"):
                self._pos += 2
            else:
                self._pos += 1

        self._pos = content_start + 1

    def _parse_inline_internal_target(self):
        assert self._ch_eq(self._pos, "_")
        assert self._ch_eq(self._pos + 1, "`")

        token_start = self._pos
        self._pos += 2
        content_start = self._pos

        prev_char = self._ch_at(token_start - 1)
        next_char = self._ch_at(content_start)
        if not _is_start_string(prev_char, next_char):
            self._pos = content_start + 1
            return

        while self._fits(self._pos):
            if self._ch_eq(self._pos, "`"):
                content_end = self._pos
                self._pos += 1
                token_end = self._pos

                prev_char = self._ch_at(content_end - 1)
                next_char = self._ch_at(token_end)
                if not _is_end_string(prev_char, next_char):
                    self._pos = content_end + 1
                    continue

                if content_start == content_end:
                    # Empty content is not allowed.
                    break

                self._emit(token_start, content_start, content_end, token_end, "text")
                return
            elif self._ch_eq(self._pos, "\\"):
                self._pos += 2
            else:
                self._pos += 1

        self._pos = content_start + 1

    def _parse_footnote_reference(self):
        assert self._ch_eq(self._pos, "[")

        token_start = self._pos
        self._pos += 1
        content_start = self._pos

        prev_char = self._ch_at(token_start - 1)
        next_char = self._ch_at(content_start)
        if not _is_start_string(prev_char, next_char):
            self._pos = content_start + 1
            return

        while self._fits(self._pos):
            if self._ch_eq(self._pos, "]") and self._ch_eq(self._pos + 1, "_"):
                content_end = self._pos
                self._pos += 2
                token_end = self._pos

                prev_char = self._ch_at(content_end - 1)
                next_char = self._ch_at(token_end)
                if not _is_end_string(prev_char, next_char):
                    self._pos = content_end + 1
                    continue

                if content_start == content_end:
                    # Empty content is not allowed.
                    break

                target = self._link_resolver.find_link(
                    self._text[content_start:content_end],
                    None,
                    is_anonymous=False,
                )
                if target and target.type == "footnote":
                    content = target.content
                else:
                    content = None
                token = self._emit(
                    token_start, content_start, content_end, token_end, "footnote"
                )
                token.data["content"] = content
                return
            elif self._ch_eq(self._pos, "\\"):
                self._pos += 2
            else:
                self._pos += 1

        self._pos = content_start + 1
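
    # Unquoted links like ``name_`` are detected at their trailing
    # underscore(s); having seen the underscores, we scan backwards over the
    # reference name to find where the link starts.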
    def _parse_unquoted_link(self):
        content_end = self._pos
        n_underscores = self._eat("_")
        token_end = self._pos

        assert n_underscores > 0

        if n_underscores > 2:
            return

        prev_char = self._ch_at(content_end - 1)
        next_char = self._ch_at(token_end)
        if not _is_end_string(prev_char, next_char):
            return

        # Can be a link without backticks. Scan back to find its start.
        content_start = content_end
        while content_start - 1 >= self._start:
            match self._text[content_start - 1]:
                case ch if ch.isalnum():
                    content_start -= 1
                case ch if ch in "-_+:," and not self._ch_in(
                    content_start - 2, "-_+:,"
                ):
                    # Isolated punctuation.
                    content_start -= 1
                case _:
                    break

        # Start string is empty as per RST spec.
        token_start = content_start

        prev_char = self._ch_at(token_start - 1)
        next_char = self._ch_at(content_start)
        if not _is_start_string(prev_char, next_char):
            return

        if content_start == content_end:
            return

        title = self._text[content_start:content_end]
        target = self._link_resolver.find_link(
            title,
            None,
            is_anonymous=n_underscores == 2,
        )
        if target and target.type == "link":
            url = target.content
        else:
            url = None
        token = self._emit(token_start, content_start, content_end, token_end, "link")
        token.data["url"] = url
        token.data["title"] = title


def parse(text: str, /, *, dedent: bool = True) -> yuio.doc.Document:
    """
    Parse a ReStructuredText document and return an AST node.

    :param text:
        text to parse. If `dedent` is set (the default), common indentation
        will be removed from this string, making it suitable to use with
        triple-quoted literals.
    :param dedent:
        remove common leading indentation from `text`.
    :returns:
        parsed AST node.

    """

    if dedent:
        text = _dedent(text)

    return RstParser().parse(text)