# Yuio project, MIT license.
#
# https://github.com/taminomara/yuio/
#
# You're free to copy this file to your project and edit it for your needs,
# just keep this copyright line please :3

"""
Parser for ReStructuredText.

Yuio supports all RST features except tables and option lists.

**Supported block markup:**

- headings,
- numbered and bullet lists,
- definition lists,
- field lists,
- literal blocks, both indented and quoted,
- line blocks,
- quotes,
- doctest blocks,
- directives,
- hyperlink targets,
- footnotes,
- thematic breaks.

**Supported directives:**

- code:
  ``code-block``,
  ``sourcecode``,
  ``code``;
- admonitions:
  ``attention``,
  ``caution``,
  ``danger``,
  ``error``,
  ``hint``,
  ``important``,
  ``note``,
  ``seealso``,
  ``tip``,
  ``warning``;
- versioning:
  ``versionadded``,
  ``versionchanged``,
  ``deprecated``;
- any other directive is rendered as un-highlighted code.

**Supported inline syntax:**

- emphasis (``*em*``),
- strong emphasis (``**strong**``),
- inline code in backticks (```code```),
- interpreted text (```code```, ``:role:`code```),
- hyperlink references (```text`_``, ``text_``, ```text`__``, ``text__``)
  in terminals that can render them,
- footnotes (``[...]_``),
- inline internal targets and substitution references are parsed correctly,
  but they have no effect.

**Supported inline roles:**

- ``flag`` for CLI flags,
- any other role is interpreted as a documentation reference with explicit titles
  (``:py:class:`title <mod.Class>```) and path shortening via tilde
  (``:py:class:`~mod.Class```).
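
**Example:**

A minimal usage sketch, parsing a small document into a ``yuio.doc.Document``
tree::

    import yuio.rst

    doc = yuio.rst.parse(
        '''
        Greeting
        ========

        Hello, *world*!
        '''
    )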

.. autofunction:: parse

.. autoclass:: RstParser
   :members:

"""

from __future__ import annotations

import dataclasses
import re
import string
from dataclasses import dataclass
from enum import Enum

import yuio.doc
from yuio.util import dedent as _dedent

import yuio._typing_ext as _tx
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    import typing_extensions as _t
else:
    from yuio import _typing as _t

__all__ = [
    "RstParser",
    "parse",
]


class _LineEnding(Enum):
    NORMAL = "NORMAL"
    LITERAL_MARK = "LITERAL_MARK"  # Line ends with double colon


_LINE_BLOCK_START_RE = re.compile(
    r"""
    ^
    (?P<indent>
        (?P<open_marker>\|)
        (?P<space>\s+|$)
    )
    (?P<tail>.*)
    """,
    re.VERBOSE,
)


_BULLET_LIST_START_RE = re.compile(
    r"""
    ^
    (?P<indent>
        (?P<enumerator>[*+•‣⁃-])
        (?P<space>\s+|$)
    )
    (?P<tail>.*)
    $
    """,
    re.VERBOSE,
)


_NUM_LIST_START_RE = re.compile(
    r"""
    ^
    (?P<indent>
        (?P<open_marker>\(?)
        (?P<enumerator>
            (?P<enumerator_num>\d+)
            | (?P<enumerator_auto>\#)
            | (?P<enumerator_lowercase>[a-z]+)
            | (?P<enumerator_uppercase>[A-Z]+)
        )
        (?P<close_marker>[).])
        (?P<space>\s+|$)
    )
    (?P<tail>.*)
    $
    """,
    re.VERBOSE,
)


_EXPLICIT_MARKUP_START_RE = re.compile(
    r"""
    ^
    (?P<indent>
        (?P<open_marker>\.\.)
        (?P<space>\s+|$)
    )
    (?P<tail>.*)
    $
    """,
    re.VERBOSE,
)

_IMPLICIT_HYPERLINK_TARGET_RE = re.compile(
    r"""
    ^
    (?P<indent>
        (?P<open_marker>__)
        (?P<space>\s+|$)
    )
    (?P<tail>.*)
    $
    """,
    re.VERBOSE,
)

_FIELD_START_RE = re.compile(
    r"""
    ^
    (?P<indent>
        (?P<open_marker>:)
        (?P<content>(?:[^:\\]|\\.|:(?!\s|`))+)
        (?P<close_marker>:)
        (?P<space>\s+|$)
    )
    (?P<tail>.*)
    $
    """,
    re.VERBOSE,
)
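# For example, _FIELD_START_RE matches field list lines like
# ":param name: description", capturing "param name" as `content`
# and "description" as `tail`.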


_PUNCT = tuple(string.punctuation)


@dataclass(slots=True)
class _Hyperlink:
    start: int
    end: int
    name: str
    type: _t.Literal["link", "footnote", "redirect"]
    content: str
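

# Resolves hyperlink references against the targets collected during block
# parsing. Anonymous links (```text`__``) and auto-numbered or auto-symbol
# footnotes are matched up with their targets in document order.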
class _LinkResolver:
    def __init__(
        self,
        targets: dict[str, _Hyperlink],
        anonymous_links: list[_Hyperlink],
        auto_numbered_footnotes: list[str] = [],
        auto_character_footnotes: list[str] = [],
    ) -> None:
        self._targets: dict[str, _Hyperlink] = targets

        self._anonymous_links: list[_Hyperlink] = anonymous_links
        self._current_anonymous_link = 0

        self._auto_numbered_footnotes: list[str] = auto_numbered_footnotes
        self._current_auto_numbered_footnote = 0
        self._auto_character_footnotes: list[str] = auto_character_footnotes
        self._current_auto_character_footnote = 0

    def find_link(self, title: str, target: str | None, is_anonymous: bool):
        if target:
            # Process explicit target.
            target, is_redirect = _normalize_hyperlink_target(target)
            if is_redirect:
                link = self._resolve_redirect(target)
            else:
                link = _Hyperlink(0, 0, title, "link", target)
            if link and not is_anonymous:
                # Save implicitly declared anchor.
                anchor = _normalize_hyperlink_anchor(title)
                self._targets.setdefault(anchor, link)
        elif is_anonymous:
            link = self._next_anonymous_link()
        else:
            anchor = _normalize_hyperlink_anchor(title)
            if anchor.startswith("#"):
                anchor = anchor[1:]
                if not anchor:
                    anchor = self._next_auto_numbered_footnote() or ""
            elif anchor.startswith("*"):
                anchor = anchor[1:]
                if not anchor:
                    anchor = self._next_auto_character_footnote() or ""
            if not anchor:
                return None
            link = self._targets.get(anchor)
            if link and link.type == "redirect":
                link = self._resolve_redirect(link.content)
        if not link or not link.content:
            return None
        else:
            return link

    def _next_anonymous_link(self):
        if self._current_anonymous_link >= len(self._anonymous_links):
            return None
        link = self._anonymous_links[self._current_anonymous_link]
        self._current_anonymous_link += 1
        return link

    def _next_auto_numbered_footnote(self):
        if self._current_auto_numbered_footnote >= len(self._auto_numbered_footnotes):
            return None
        link = self._auto_numbered_footnotes[self._current_auto_numbered_footnote]
        self._current_auto_numbered_footnote += 1
        return link

    def _next_auto_character_footnote(self):
        if self._current_auto_character_footnote >= len(self._auto_character_footnotes):
            return None
        link = self._auto_character_footnotes[self._current_auto_character_footnote]
        self._current_auto_character_footnote += 1
        return link

    def _resolve_redirect(self, target: str):
        seen = set()
        while target not in seen:
            seen.add(target)
            link = self._targets.get(target)
            if link and link.type == "redirect":
                target = link.content
            elif link:
                return link
        return None


_FOOTNOTE_CHARS = "*†‡§¶#♠♥♦♣"
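# Footnote symbols are assigned like bijective base-10 numerals over the ten
# characters above: _char_footnote(1) == "*", _char_footnote(10) == "♣",
# _char_footnote(11) == "**", and so on.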


def _char_footnote(n: int, /) -> str:
    assert n > 0
    n_chars = len(_FOOTNOTE_CHARS)
    result = ""
    while n > 0:
        n -= 1
        result = _FOOTNOTE_CHARS[n % n_chars] + result
        n //= n_chars
    return result


@_t.final
class RstParser(yuio.doc.DocParser):
    """
    Parses a subset of ReStructuredText.

    """

    def parse(self, s: str, /) -> yuio.doc.Document:
        self._lines = s.expandtabs(tabsize=4).splitlines(keepends=False)
        self._headings: dict[tuple[str, bool], int] = {}
        self._links: list[_Hyperlink] = []
        self._anonymous_links: list[_Hyperlink] = []
        self._targets: dict[str, _Hyperlink] = {}
        self._last_numbered_footnote = 1
        self._last_character_footnote = 1
        self._auto_numbered_footnotes: list[str] = []
        self._auto_character_footnotes: list[str] = []

        root = yuio.doc.Document(items=[])
        self._process_block(root, 0, len(self._lines))
        link_resolver = _LinkResolver(
            self._targets,
            self._anonymous_links,
            self._auto_numbered_footnotes,
            self._auto_character_footnotes,
        )
        yuio.doc._clean_tree(root)
        self._process_inline_text(root, link_resolver)
        return root

    def parse_paragraph(self, s: str, /) -> list[str | yuio.doc.TextRegion]:
        return _InlineParser(s, _LinkResolver({}, [], [], [])).run()

    def _process_inline_text(
        self, node: yuio.doc.AstBase, link_resolver: _LinkResolver
    ):
        if isinstance(node, yuio.doc.Admonition):
            processor = _InlineParser("\n".join(map(str, node.title)), link_resolver)
            node.title = processor.run()
        if isinstance(node, yuio.doc.Text):
            processor = _InlineParser("\n".join(map(str, node.items)), link_resolver)
            node.items = processor.run()
        elif isinstance(node, yuio.doc.Container):
            for item in node.items:
                self._process_inline_text(item, link_resolver)

    def _process_block(self, parent: yuio.doc.Container[_t.Any], start: int, end: int):
        i = start
        prev_line_ending = _LineEnding.NORMAL

        while i < end:
            i, prev_line_ending = self._consume_block(parent, i, end, prev_line_ending)

        return parent

    def _consume_block(
        self,
        parent: yuio.doc.Container[_t.Any],
        start: int,
        end: int,
        prev_line_ending: _LineEnding,
    ) -> tuple[int, _LineEnding]:
        if start >= end:  # pragma: no cover
            return start, prev_line_ending

        line = self._lines[start]

        if _is_blank(line):
            return start + 1, prev_line_ending

        result = None

        if prev_line_ending == _LineEnding.LITERAL_MARK and (
            line.startswith(" ") or line.startswith(_PUNCT)
        ):
            result = self._try_process_literal_text(parent, start, end)
        elif _is_heading_underline(self._lines, start, end):
            self._process_title(parent, line, self._lines[start + 1][0], False)
            result = start + 2
        elif _is_heading_overline(self._lines, start, end):
            self._process_title(parent, self._lines[start + 1], line[0], True)
            result = start + 3
        elif line.startswith(">>>"):
            result = self._process_doctest_block(parent, start, end)
        elif line.startswith(" "):
            result = self._process_block_quote(parent, start, end)
        elif match := _LINE_BLOCK_START_RE.match(line):
            result = self._process_line_block(parent, start, end, match)
        elif match := _BULLET_LIST_START_RE.match(line):
            result = self._process_bullet_list(parent, start, end, match)
        elif match := _NUM_LIST_START_RE.match(line):
            result = self._try_process_numbered_list(parent, start, end, match)
        elif match := _EXPLICIT_MARKUP_START_RE.match(line):
            result = self._try_process_explicit_markup(parent, start, end, match)
        elif match := _IMPLICIT_HYPERLINK_TARGET_RE.match(line):
            result = self._process_implicit_hyperlink_target(parent, start, end, match)
        elif match := _FIELD_START_RE.match(line):
            result = self._process_field_list(parent, start, end, match)
        elif (
            start + 1 < end
            and self._lines[start + 1].startswith(" ")
            and not _is_blank(self._lines[start + 1])
        ):
            result = self._process_def_list(parent, start, end)

        if result is None:
            return self._process_paragraph(parent, start, end)
        else:
            return result, _LineEnding.NORMAL

    def _process_title(
        self,
        parent: yuio.doc.Container[_t.Any],
        title: str,
        marker: str,
        is_overline: bool,
    ):
        if level := self._headings.get((marker, is_overline)):
            parent.items.append(yuio.doc.Heading(items=[title.strip()], level=level))
        else:
            level = len(self._headings) + 1
            self._headings[(marker, is_overline)] = level
            parent.items.append(yuio.doc.Heading(items=[title.strip()], level=level))

    def _try_process_literal_text(
        self, parent: yuio.doc.Container[_t.Any], start: int, end: int
    ) -> int | None:
        ch = self._lines[start][0]

        if ch.isspace():
            end = self._gather_indented_lines(start, end, True)
        elif ch in _PUNCT:
            end = self._gather_prefixed_lines(start, end, ch)
        else:  # pragma: no cover
            return None

        node = yuio.doc.Code(lines=[], syntax="text")
        for i in range(start, end):
            node.lines.append(self._lines[i])
        parent.items.append(node)

        return end

    def _process_line_block(
        self,
        parent: yuio.doc.Container[_t.Any],
        start: int,
        end: int,
        match: _tx.StrReMatch | None,
    ) -> int | None:
        block_end = start + 1
        lines = []
        while match:
            self._lines[start] = match["tail"]
            block_end = self._gather_indented_lines(start + 1, end, False)
            lines.append(" ".join(self._lines[start:block_end]))

            start = block_end
            if start >= end:
                match = None
            else:
                match = _LINE_BLOCK_START_RE.match(self._lines[start])

        node = yuio.doc.Paragraph(items=["\v".join(lines)])
        parent.items.append(node)
        return block_end

    def _process_bullet_list(
        self,
        parent: yuio.doc.Container[_t.Any],
        start: int,
        end: int,
        match: _tx.StrReMatch,
    ) -> int:
        if (
            parent.items
            and isinstance(parent.items[-1], yuio.doc.List)
            and parent.items[-1].items
        ):
            list_node = parent.items[-1]
            prev_enumerator_kind = list_node.enumerator_kind
            prev_marker_kind = list_node.marker_kind
            prev_num = list_node.items[-1].number
        else:
            list_node = None
            prev_enumerator_kind = None
            prev_marker_kind = None
            prev_num = None

        enumerator_kind = match["enumerator"]
        marker_kind = None
        num = None

        if (
            enumerator_kind != prev_enumerator_kind
            or marker_kind != prev_marker_kind
            or (prev_num is not None)
        ):
            list_node = None

        if list_node is None:
            list_node = yuio.doc.List(
                items=[], enumerator_kind=enumerator_kind, marker_kind=marker_kind
            )
            parent.items.append(list_node)

        self._lines[start] = match["tail"]
        if not match["space"]:
            end = self._gather_indented_lines(start + 1, end, True)
        else:
            indent = len(match["indent"])
            end = self._gather_exactly_indented_lines(start + 1, end, indent, True)

        node = yuio.doc.ListItem(items=[], number=num)
        self._process_block(node, start, end)
        list_node.items.append(node)
        return end

    def _try_process_numbered_list(
        self,
        parent: yuio.doc.Container[_t.Any],
        start: int,
        end: int,
        match: _tx.StrReMatch,
    ) -> int | None:
        if (
            parent.items
            and isinstance(parent.items[-1], yuio.doc.List)
            and parent.items[-1].items
        ):
            list_node = parent.items[-1]
            prev_enumerator_kind = list_node.enumerator_kind
            prev_marker_kind = list_node.marker_kind
            prev_num = list_node.items[-1].number
        else:
            list_node = None
            prev_enumerator_kind = None
            prev_marker_kind = None
            prev_num = None

        list_data = _detect_num_list_type(
            match,
            prev_enumerator_kind,
            prev_marker_kind,
            prev_num,
        )

        if list_data is None:
            return None  # TODO: this is not covered, I don't know why

        enumerator_kind, marker_kind, num = list_data

        # Verify next line (if exists) is compatible
        if start + 1 < end:
            next_line = self._lines[start + 1]
            if not (
                not next_line
                or next_line.startswith(" ")
                or _is_list_start(next_line, enumerator_kind, marker_kind, num)
            ):
                return None

        if (
            enumerator_kind != prev_enumerator_kind
            or marker_kind != prev_marker_kind
            or (prev_num is None or num != prev_num + 1)
        ):
            list_node = None

        if list_node is None:
            list_node = yuio.doc.List(
                items=[], enumerator_kind=enumerator_kind, marker_kind=marker_kind
            )
            parent.items.append(list_node)

        self._lines[start] = match["tail"]
        if not match["space"]:
            end = self._gather_indented_lines(start + 1, end, True)
        else:
            indent = len(match["indent"])
            end = self._gather_exactly_indented_lines(start + 1, end, indent, True)

        node = yuio.doc.ListItem(items=[], number=num)
        self._process_block(node, start, end)
        list_node.items.append(node)
        return end

    def _process_doctest_block(
        self, parent: yuio.doc.Container[_t.Any], start: int, end: int
    ) -> int | None:
        node = yuio.doc.Code(lines=[], syntax="python")

        block_end = 0
        for i in range(start, end):
            line = self._lines[i]
            if _is_blank(line):
                break
            node.lines.append(line)
            block_end = i + 1

        parent.items.append(node)
        return block_end

    def _try_process_explicit_markup(
        self,
        parent: yuio.doc.Container[_t.Any],
        start: int,
        end: int,
        match: _tx.StrReMatch,
    ) -> int | None:
        """Try to process explicit markup (directives, comments, etc.)."""
        content = match["tail"].strip()

        if not content:
            start += 1
            if start < end and not _is_blank(self._lines[start]):
                return self._gather_indented_lines(start + 1, end, True)
            else:
                return start

        if content.startswith("["):
            return self._parse_footnote(parent, start, end, content)

        if content.startswith("|"):
            # TODO: save substitution
            return self._gather_indented_lines(start + 1, end, False)

        if content.startswith("_"):
            return self._parse_hyperlink_target(start, end, content)

        # Directive
        if "::" in content:
            return self._parse_directive(parent, start, end, content)

        # Default to comment
        return self._gather_indented_lines(start + 1, end, True)

    def _parse_hyperlink_target(self, start: int, end: int, content: str):
        end = self._gather_indented_lines(start + 1, end, False)
        content += "\n".join(self._lines[start + 1 : end])
        anchor, _, target = content[1:].partition(":")
        anchor = _normalize_hyperlink_anchor(anchor)
        target, is_redirect = _normalize_hyperlink_target(target)
        self._add_link(
            _Hyperlink(
                start,
                end,
                anchor,
                "redirect" if is_redirect else "link",
                target,
            )
        )
        return end

    def _parse_footnote(
        self, parent: yuio.doc.Container[_t.Any], start: int, end: int, content: str
    ):
        end = self._gather_indented_lines(start + 1, end, True)
        name, _, content = content[1:].partition("]")
        self._lines[start] = content.strip()

        if name.startswith("#"):
            name = name[1:]
            while True:
                auto_name = str(self._last_numbered_footnote)
                self._last_numbered_footnote += 1
                if auto_name not in self._targets:
                    break
            if not name:
                self._auto_numbered_footnotes.append(auto_name)
        elif name.startswith("*"):
            name = name[1:]
            while True:
                auto_name = _char_footnote(self._last_character_footnote)
                self._last_character_footnote += 1
                if auto_name not in self._targets:
                    break
            if not name:
                self._auto_character_footnotes.append(auto_name)
        else:
            auto_name = name

        link = _Hyperlink(start, end, auto_name, "footnote", auto_name)
        self._add_link(link)
        if name and name not in self._targets:
            self._targets[name] = link

        if parent.items and isinstance(parent.items[-1], yuio.doc.FootnoteContainer):
            container = parent.items[-1]
        else:
            container = yuio.doc.FootnoteContainer(items=[])
            parent.items.append(container)

        node = yuio.doc.Footnote(
            items=[],
            marker=auto_name,
        )
        self._process_block(node, start, end)
        container.items.append(node)

        return end
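
    # When several hyperlink targets are stacked directly on top of each other,
    # targets without content of their own inherit the content of the link
    # that follows them.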
    def _add_link(self, link: _Hyperlink):
        if link.content:
            start = link.start
            for prev_link in reversed(self._links):
                if prev_link.content:
                    break
                if not (
                    prev_link.end == start
                    or all(
                        _is_blank(line) for line in self._lines[prev_link.end : start]
                    )
                ):
                    break
                prev_link.type = link.type
                prev_link.content = link.content
                start = prev_link.start
        self._links.append(link)
        if link.name == "_":
            self._anonymous_links.append(link)
        elif link.name not in self._targets:
            self._targets[link.name] = link

    def _parse_directive(
        self, parent: yuio.doc.Container[_t.Any], start: int, end: int, content: str
    ) -> int:
        name, _, arg = content.partition("::")
        name = name.strip()
        arg = arg.strip()

        end = self._gather_indented_lines(start + 1, end, True)

        i = start + 1

        # Parse arguments and options.
        while i < end:
            arg_line = self._lines[i]
            i += 1
            if _is_blank(arg_line):
                break

        parent.items.extend(
            yuio.doc._process_directive(
                name,
                arg,
                lambda: self._lines[i:end],
                lambda: self._process_block(yuio.doc.Document(items=[]), i, end).items,
            )
        )

        return end

    def _process_block_quote(
        self, parent: yuio.doc.Container[_t.Any], start: int, end: int
    ) -> int:
        end = self._gather_indented_lines(start, end, True)
        node = yuio.doc.Quote(items=[])
        self._process_block(node, start, end)
        parent.items.append(node)
        return end

    def _process_implicit_hyperlink_target(
        self,
        parent: yuio.doc.Container[_t.Any],
        start: int,
        end: int,
        match: _tx.StrReMatch,
    ) -> int:
        return self._parse_hyperlink_target(start, end, f"__: {match.group('tail')}")

    def _process_field_list(
        self,
        parent: yuio.doc.Container[_t.Any],
        start: int,
        end: int,
        match: _tx.StrReMatch,
    ) -> int:
        self._lines[start] = match["tail"]
        end = self._gather_indented_lines(start + 1, end, True)
        node = yuio.doc.Admonition(
            items=[],
            title=[match["content"].strip() + "\\ :"],
            type="field",
        )
        self._process_block(node, start, end)
        parent.items.append(node)
        return end

    def _process_def_list(
        self, parent: yuio.doc.Container[_t.Any], start: int, end: int
    ) -> int:
        end = self._gather_indented_lines(start + 1, end, True)
        node = yuio.doc.Admonition(
            items=[],
            title=[self._lines[start].strip()],
            type="definition",
        )
        self._process_block(node, start + 1, end)
        parent.items.append(node)
        return end
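
    # A paragraph that consists of a lone "::" switches the parser into
    # literal-block mode without emitting anything; a paragraph ending in
    # "text::" does the same, but keeps a single trailing colon on the text.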
    def _process_paragraph(
        self, parent: yuio.doc.Container[_t.Any], start: int, end: int
    ) -> tuple[int, _LineEnding]:
        end = self._gather_exactly_indented_lines(start, end, 0, False)
        if end == start + 1 and self._lines[start].strip() == "::":
            return end, _LineEnding.LITERAL_MARK
        elif end == start + 1 and _is_transition(self._lines[start]):
            parent.items.append(yuio.doc.ThematicBreak())
            return end, _LineEnding.NORMAL
        elif end > start and self._lines[end - 1].rstrip().endswith("::"):
            line_ending = _LineEnding.LITERAL_MARK
            self._lines[end - 1] = self._lines[end - 1].rstrip()[:-1]
        else:
            line_ending = _LineEnding.NORMAL
        node = yuio.doc.Paragraph(
            items=_t.cast(list[str | yuio.doc.TextRegion], self._lines[start:end])
        )
        parent.items.append(node)
        return end, line_ending

    def _gather_indented_lines(self, start: int, end: int, allow_blank: bool) -> int:
        if start >= end:
            return start

        common_indent = None
        result_end = start

        for i in range(start, end):
            line = self._lines[i]
            if _is_blank(line):
                if allow_blank:
                    continue
                else:
                    break

            indent = len(line) - len(line.lstrip())
            if indent >= 1:
                result_end = i + 1
                if common_indent is None:
                    common_indent = indent
                else:
                    common_indent = min(common_indent, indent)
            else:
                break

        if common_indent:
            for i in range(start, result_end):
                self._lines[i] = self._lines[i][common_indent:]

        return result_end

    def _gather_exactly_indented_lines(
        self, start: int, end: int, min_indent: int, allow_blank: bool
    ) -> int:
        result_end = start

        for i in range(start, end):
            line = self._lines[i]
            if _is_blank(line):
                if allow_blank:
                    continue
                else:
                    break

            if not min_indent:
                result_end = i + 1
            elif len(line) - len(line.lstrip()) >= min_indent:
                result_end = i + 1
                self._lines[i] = self._lines[i][min_indent:]
            else:
                break

        return result_end

    def _gather_prefixed_lines(self, start: int, end: int, prefix: str) -> int:
        result_end = start

        for i in range(start, end):
            if self._lines[i] and self._lines[i][0] == prefix:
                result_end = i + 1
            else:
                break

        return result_end


def _is_blank(line: str) -> bool:
    return not line or line.isspace()


def _is_transition(line: str) -> bool:
    return len(line) >= 4 and line[0] in _PUNCT and all(c == line[0] for c in line)


def _is_heading_underline(lines, start, end):
    if end - start < 2:
        return False
    title, underline = lines[start : start + 2]
    return (
        title
        and not title.startswith(" ")
        and underline
        and underline[0] in _PUNCT
        and all(c == underline[0] for c in underline)
        and len(title) <= len(underline)
    )


def _is_heading_overline(lines, start, end):
    if end - start < 3:
        return False
    overline, title, underline = lines[start : start + 3]
    return (
        overline
        and title
        and underline
        and overline[0] in _PUNCT
        and overline[0] == underline[0]
        and all(c == overline[0] for c in overline)
        and len(title) <= len(overline)
        and all(c == underline[0] for c in underline)
        and len(title) <= len(underline)
    )


# fmt: off
# The following code is copied from docutils/utils/punctuation_chars.py
# Copyright 2011, 2017 Günter Milde, 2-Clause BSD license.
# See https://sourceforge.net/p/docutils/code/HEAD/tree/trunk/docutils/docutils/utils/punctuation_chars.py.
# See https://opensource.org/license/BSD-2-Clause.
_OPENERS = (
    "\"'(<\\[{\u0f3a\u0f3c\u169b\u2045\u207d\u208d\u2329\u2768"
    "\u276a\u276c\u276e\u2770\u2772\u2774\u27c5\u27e6\u27e8\u27ea"
    "\u27ec\u27ee\u2983\u2985\u2987\u2989\u298b\u298d\u298f\u2991"
    "\u2993\u2995\u2997\u29d8\u29da\u29fc\u2e22\u2e24\u2e26\u2e28"
    "\u3008\u300a\u300c\u300e\u3010\u3014\u3016\u3018\u301a\u301d"
    "\u301d\ufd3e\ufe17\ufe35\ufe37\ufe39\ufe3b\ufe3d\ufe3f\ufe41"
    "\ufe43\ufe47\ufe59\ufe5b\ufe5d\uff08\uff3b\uff5b\uff5f\uff62"
    "\xab\u2018\u201c\u2039\u2e02\u2e04\u2e09\u2e0c\u2e1c\u2e20"
    "\u201a\u201e\xbb\u2019\u201d\u203a\u2e03\u2e05\u2e0a\u2e0d"
    "\u2e1d\u2e21\u201b\u201f"
)
_CLOSERS = (
    "\"')>\\]}\u0f3b\u0f3d\u169c\u2046\u207e\u208e\u232a\u2769"
    "\u276b\u276d\u276f\u2771\u2773\u2775\u27c6\u27e7\u27e9\u27eb"
    "\u27ed\u27ef\u2984\u2986\u2988\u298a\u298c\u298e\u2990\u2992"
    "\u2994\u2996\u2998\u29d9\u29db\u29fd\u2e23\u2e25\u2e27\u2e29"
    "\u3009\u300b\u300d\u300f\u3011\u3015\u3017\u3019\u301b\u301e"
    "\u301f\ufd3f\ufe18\ufe36\ufe38\ufe3a\ufe3c\ufe3e\ufe40\ufe42"
    "\ufe44\ufe48\ufe5a\ufe5c\ufe5e\uff09\uff3d\uff5d\uff60\uff63"
    "\xbb\u2019\u201d\u203a\u2e03\u2e05\u2e0a\u2e0d\u2e1d\u2e21"
    "\u201b\u201f\xab\u2018\u201c\u2039\u2e02\u2e04\u2e09\u2e0c"
    "\u2e1c\u2e20\u201a\u201e"
)
_DELIMITERS = (
    "\\-/:\u058a\xa1\xb7\xbf\u037e\u0387\u055a-\u055f\u0589"
    "\u05be\u05c0\u05c3\u05c6\u05f3\u05f4\u0609\u060a\u060c"
    "\u060d\u061b\u061e\u061f\u066a-\u066d\u06d4\u0700-\u070d"
    "\u07f7-\u07f9\u0830-\u083e\u0964\u0965\u0970\u0df4\u0e4f"
    "\u0e5a\u0e5b\u0f04-\u0f12\u0f85\u0fd0-\u0fd4\u104a-\u104f"
    "\u10fb\u1361-\u1368\u1400\u166d\u166e\u16eb-\u16ed\u1735"
    "\u1736\u17d4-\u17d6\u17d8-\u17da\u1800-\u180a\u1944\u1945"
    "\u19de\u19df\u1a1e\u1a1f\u1aa0-\u1aa6\u1aa8-\u1aad\u1b5a-"
    "\u1b60\u1c3b-\u1c3f\u1c7e\u1c7f\u1cd3\u2010-\u2017\u2020-"
    "\u2027\u2030-\u2038\u203b-\u203e\u2041-\u2043\u2047-"
    "\u2051\u2053\u2055-\u205e\u2cf9-\u2cfc\u2cfe\u2cff\u2e00"
    "\u2e01\u2e06-\u2e08\u2e0b\u2e0e-\u2e1b\u2e1e\u2e1f\u2e2a-"
    "\u2e2e\u2e30\u2e31\u3001-\u3003\u301c\u3030\u303d\u30a0"
    "\u30fb\ua4fe\ua4ff\ua60d-\ua60f\ua673\ua67e\ua6f2-\ua6f7"
    "\ua874-\ua877\ua8ce\ua8cf\ua8f8-\ua8fa\ua92e\ua92f\ua95f"
    "\ua9c1-\ua9cd\ua9de\ua9df\uaa5c-\uaa5f\uaade\uaadf\uabeb"
    "\ufe10-\ufe16\ufe19\ufe30-\ufe32\ufe45\ufe46\ufe49-\ufe4c"
    "\ufe50-\ufe52\ufe54-\ufe58\ufe5f-\ufe61\ufe63\ufe68\ufe6a"
    "\ufe6b\uff01-\uff03\uff05-\uff07\uff0a\uff0c-\uff0f\uff1a"
    "\uff1b\uff1f\uff20\uff3c\uff61\uff64\uff65"
    "\U00010100\U00010101\U0001039f\U000103d0\U00010857"
    "\U0001091f\U0001093f\U00010a50-\U00010a58\U00010a7f"
    "\U00010b39-\U00010b3f\U000110bb\U000110bc\U000110be-"
    "\U000110c1\U00012470-\U00012473"
)
_CLOSING_DELIMITERS = r"\\.,;!?"
_QUOTE_PAIRS = {
    # open char: matching closing characters  # use case
    "\xbb": "\xbb",  # » » Swedish
    "\u2018": "\u201a",  # ‘ ‚ Albanian/Greek/Turkish
    "\u2019": "\u2019",  # ’ ’ Swedish
    "\u201a": "\u2018\u2019",  # ‚ ‘ German, ‚ ’ Polish
    "\u201c": "\u201e",  # “ „ Albanian/Greek/Turkish
    "\u201e": "\u201c\u201d",  # „ “ German, „ ” Polish
    "\u201d": "\u201d",  # ” ” Swedish
    "\u203a": "\u203a",  # › › Swedish
}
def _match_chars(c1, c2):
    try:
        i = _OPENERS.index(c1)
    except ValueError:  # c1 not in openers
        return False
    return c2 == _CLOSERS[i] or c2 in _QUOTE_PAIRS.get(c1, "")
# End docutils code.
# fmt: on

_OPENERS_RE = re.compile(rf"[{_OPENERS}{_DELIMITERS}]")
_CLOSERS_RE = re.compile(rf"[{_CLOSERS}{_DELIMITERS}{_CLOSING_DELIMITERS}]")
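

# Inline markup recognition rules from the RST spec: markup may only start
# after whitespace or an opening/delimiting character, may only end before
# whitespace or a closing/delimiting character, and the characters directly
# around it must not form a matching quote pair.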
def _is_start_string(prev: str, next: str) -> bool:
    if next.isspace():
        return False
    if prev.isspace():
        return True
    if _match_chars(prev, next):
        return False
    # if character_level_inline_markup:
    #     return True
    return _OPENERS_RE.match(prev) is not None


def _is_end_string(prev: str, next: str) -> bool:
    if prev.isspace():
        return False
    if next.isspace():
        return True
    if _match_chars(prev, next):
        return False
    # if character_level_inline_markup:
    #     return True
    return _CLOSERS_RE.match(next) is not None
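

# Examples of enumerator detection: "1." -> (NUMBER, DOT, 1),
# "(a)" -> (SMALL_LETTER, ENCLOSED, 1), "iv)" -> (SMALL_ROMAN, PAREN, 4);
# the auto-enumerator "#" continues whatever list precedes it.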
def _detect_num_list_type(
    match: _tx.StrReMatch,
    prev_enumerator_kind: yuio.doc.ListEnumeratorKind | str | None,
    prev_marker_kind: yuio.doc.ListMarkerKind | None,
    prev_num: int | None,
) -> tuple[yuio.doc.ListEnumeratorKind, yuio.doc.ListMarkerKind, int] | None:
    match (match["open_marker"], match["close_marker"]):
        case ("(", ")"):
            marker_kind = yuio.doc.ListMarkerKind.ENCLOSED
        case ("", ")"):
            marker_kind = yuio.doc.ListMarkerKind.PAREN
        case ("", "."):
            marker_kind = yuio.doc.ListMarkerKind.DOT
        case _:
            return None

    if (
        prev_enumerator_kind is not None
        and prev_marker_kind is not None
        and prev_num is not None
        and marker_kind == prev_marker_kind
        and isinstance(prev_enumerator_kind, yuio.doc.ListEnumeratorKind)
    ):
        # List continues.
        if match["enumerator"] == "#":
            return prev_enumerator_kind, prev_marker_kind, prev_num + 1
        match prev_enumerator_kind:
            case yuio.doc.ListEnumeratorKind.NUMBER:
                expected_enumerator = str(prev_num + 1)
            case yuio.doc.ListEnumeratorKind.SMALL_LETTER:
                expected_enumerator = yuio.doc.to_letters(prev_num + 1)
            case yuio.doc.ListEnumeratorKind.CAPITAL_LETTER:
                expected_enumerator = yuio.doc.to_letters(prev_num + 1).upper()
            case yuio.doc.ListEnumeratorKind.SMALL_ROMAN:
                expected_enumerator = yuio.doc.to_roman(prev_num + 1)
            case yuio.doc.ListEnumeratorKind.CAPITAL_ROMAN:
                expected_enumerator = yuio.doc.to_roman(prev_num + 1).upper()
        if match["enumerator"].lstrip("0") == expected_enumerator:
            return prev_enumerator_kind, prev_marker_kind, prev_num + 1

    # List starts afresh.
    if enumerator := match["enumerator_num"]:
        return yuio.doc.ListEnumeratorKind.NUMBER, marker_kind, int(enumerator)
    elif enumerator := match["enumerator_auto"]:
        return yuio.doc.ListEnumeratorKind.NUMBER, marker_kind, 1
    elif enumerator := match["enumerator_lowercase"]:
        if (enumerator == "i" or len(enumerator) > 1) and (
            (num := yuio.doc.from_roman(enumerator)) is not None
        ):
            return yuio.doc.ListEnumeratorKind.SMALL_ROMAN, marker_kind, num
        elif len(enumerator) > 1:
            return None
        elif (num := yuio.doc.from_letters(enumerator)) is not None:
            return yuio.doc.ListEnumeratorKind.SMALL_LETTER, marker_kind, num
        else:
            return None
    elif enumerator := match["enumerator_uppercase"]:
        if (enumerator == "I" or len(enumerator) > 1) and (
            num := yuio.doc.from_roman(enumerator)
        ) is not None:
            return yuio.doc.ListEnumeratorKind.CAPITAL_ROMAN, marker_kind, num
        elif len(enumerator) > 1:
            return None
        elif (num := yuio.doc.from_letters(enumerator)) is not None:
            return yuio.doc.ListEnumeratorKind.CAPITAL_LETTER, marker_kind, num
        else:
            return None

    return None


def _is_list_start(
    line: str,
    prev_enumerator_kind: yuio.doc.ListEnumeratorKind | str,
    prev_marker_kind: yuio.doc.ListMarkerKind,
    prev_num: int,
):
    match = _NUM_LIST_START_RE.match(line)
    if not match:
        return False
    list_data = _detect_num_list_type(
        match, prev_enumerator_kind, prev_marker_kind, prev_num
    )
    if not list_data:
        return False
    enumerator_kind, marker_kind, num = list_data
    return (
        enumerator_kind == prev_enumerator_kind
        and marker_kind == prev_marker_kind
        and num == prev_num + 1
    )


def _normalize_hyperlink_anchor(anchor: str) -> str:
    return _unescape(re.sub(r"\s+", " ", anchor.strip()).casefold())
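

# For example, "Other Target_" normalizes to ("OtherTarget", True), a redirect
# to another anchor, while "https://example.com/" normalizes to
# ("https://example.com/", False), a plain URL.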
def _normalize_hyperlink_target(target: str) -> tuple[str, bool]:
    is_redirect = bool(re.match(r"^(\\.|[^\\])*_$", target))
    target = re.sub(r"\\(.)|\s", r"\1", target)
    if is_redirect:
        target = target[:-1]
    return target, is_redirect


def _unescape(text: str) -> str:
    return re.sub(r"\\(?:\s|(.))", r"\1", text)


@dataclass(slots=True)
class _Token:
    """
    Token for processing inline markup.

    """

    start: int
    end: int
    kind: str

    _data: dict[str, _t.Any] | None = dataclasses.field(init=False, default=None)

    @property
    def data(self):
        if self._data is None:
            self._data = {}
        return self._data


class _InlineParser:
    def __init__(self, text: str, link_resolver: _LinkResolver) -> None:
        self._text: str = text
        self._start: int = 0
        self._pos: int = 0
        self._tokens: list[_Token] = []
        self._link_resolver = link_resolver

    def run(self) -> list[str | yuio.doc.TextRegion]:
        while self._fits(self._pos):
            self._run()
        if self._start < len(self._text):
            self._tokens.append(_Token(self._start, len(self._text), "text"))

        res: list[str | yuio.doc.TextRegion] = []
        for token in self._tokens:
            text = _unescape(self._text[token.start : token.end])
            match token.kind:
                case "text":
                    res.append(text)
                case "em":
                    res.append(yuio.doc.HighlightedRegion(text, color="em"))
                case "strong":
                    res.append(yuio.doc.HighlightedRegion(text, color="strong"))
                case "formatted":
                    res.append(token.data["content"])
                case "link":
                    if title := token.data.get("title"):
                        text = _unescape(title)
                    res.append(yuio.doc.LinkRegion(text, url=token.data.get("url", "")))
                case "footnote":
                    if content := token.data.get("content"):
                        text = _unescape(content)
                    text = f"[{text}]"
                    res.append(
                        yuio.doc.NoWrapRegion(
                            yuio.doc.HighlightedRegion(text, color="role/footnote")
                        )
                    )
                case kind:
                    assert False, kind
        return res

    def _fits(self, i):
        return i < len(self._text)

    def _ch_eq(self, i, cs):
        return self._fits(i) and self._text[i] in cs

    def _ch_in(self, i, cs):
        return self._fits(i) and self._text[i] in cs

    def _ch_at(self, i):
        if 0 <= i < len(self._text):
            return self._text[i]
        else:
            return " "

    def _eat(self, ch):
        start = self._pos
        while self._pos < len(self._text) and self._text[self._pos] == ch:
            self._pos += 1
        return self._pos - start

    def _eat_in(self, ch):
        while self._pos < len(self._text) and self._text[self._pos] in ch:
            self._pos += 1

    def _eat_not_in(self, ch):
        while self._pos < len(self._text) and self._text[self._pos] not in ch:
            self._pos += 1

    def _emit(
        self,
        tok_start: int,
        content_start: int,
        content_end: int,
        token_end: int,
        kind: str,
    ):
        if tok_start > self._start:
            self._tokens.append(_Token(self._start, tok_start, "text"))
        assert token_end == self._pos  # sanity check
        self._start = self._pos
        token = _Token(content_start, content_end, kind)
        self._tokens.append(token)
        return token

    def _run(self):
        match self._text[self._pos]:
            case "\\":
                self._pos += 2
            case "`":
                if self._ch_eq(self._pos + 1, "`"):
                    self._parse_inline_literal()
                else:
                    self._parse_interpreted_text(
                        prefix_role=None, prefix_role_start=None
                    )
            case ":":
                self._parse_prefixed_interpreted_text()
            case "*":
                if self._ch_eq(self._pos + 1, "*"):
                    self._parse_strong()
                else:
                    self._parse_emphasis()
            case "|":
                self._parse_substitution()
            case "_":
                if self._ch_eq(self._pos + 1, "`"):
                    self._parse_inline_internal_target()
                else:
                    self._parse_unquoted_link()
            case "[":
                self._parse_footnote_reference()
            case _:
                self._eat_not_in("\\`:*|_[")

    def _scan_for_explicit_role(self) -> str | None:
        """
        Eat explicit role, leaving current position right after it. If explicit role
        can't be found, returns None and leaves current position untouched::

            text :role:`ref`
                 │     └ position if this function succeeds
                 └ initial position

            text :malformed-role
                 │
                 └ initial position, position if this function fails

        """

        if not self._ch_eq(self._pos, ":"):  # pragma: no cover
            return None

        token_start = self._pos
        self._pos += 1
        content_start = self._pos

        while self._fits(self._pos):
            match self._text[self._pos]:
                case ch if ch.isalnum():
                    self._pos += 1
                case ":":
                    if self._ch_at(self._pos + 1).isalnum():
                        # Isolated punctuation.
                        self._pos += 1
                        continue

                    content_end = self._pos
                    self._pos += 1

                    if content_start == content_end:
                        # Empty content is not allowed.
                        break

                    return self._text[content_start:content_end]
                case ch if ch in "-_+:," and not self._ch_in(self._pos + 1, "-_+:,"):
                    # Isolated punctuation.
                    self._pos += 1
                case _:
                    break

        self._pos = token_start  # Leave position as it was before.
        return None

    def _parse_inline_literal(self):
        """
        Eats and emits inline literal. If inline literal can't be parsed, advances
        current position one char and returns::

            text ``literal``
                 │          └ position if this function succeeds
                 └ initial position

            text ``literal
                 │└ position if this function fails
                 └ initial position

        """

        assert self._ch_eq(self._pos, "`")
        assert self._ch_eq(self._pos + 1, "`")

        token_start = self._pos
        self._pos += 2
        content_start = self._pos

        prev_char = self._ch_at(token_start - 1)
        next_char = self._ch_at(content_start)
        if not _is_start_string(prev_char, next_char):
            self._pos = content_start + 1
            return

        while self._fits(self._pos):
            match self._text[self._pos]:
                case "`" if self._ch_eq(self._pos + 1, "`"):
                    content_end = self._pos
                    self._pos += 2
                    token_end = self._pos

                    prev_char = self._ch_at(content_end - 1)
                    next_char = self._ch_at(token_end)
                    if not _is_end_string(prev_char, next_char):
                        self._pos = content_end + 1  # Skip 1 char and continue.
                        continue

                    if content_start == content_end:
                        # Empty content is not allowed.
                        break

                    token = self._emit(
                        token_start, content_start, content_end, token_end, "formatted"
                    )
                    token.data["content"] = yuio.doc._process_role(
                        self._text[content_start:content_end], "code"
                    )
                    return
                case _:
                    self._pos += 1

        self._pos = content_start + 1

    def _parse_interpreted_text(
        self, prefix_role: str | None, prefix_role_start: int | None
    ):
        """
        Eats and emits interpreted text and its tail role or hyperlink marker.
        If interpreted text can't be parsed, advances current position one char
        and returns::

            text `ref`
                 │    └ position if this function succeeds
                 └ initial position

            text `ref
                 │└ position if this function fails
                 └ initial position

            text :role:`ref`
                 │     │    └ position if this function succeeds
                 │     └ initial position
                 └ prefix_role_start

            text :role:`ref
                 ││    └ initial position
                 │└ position if this function fails
                 └ prefix_role_start

        """

        assert self._ch_eq(self._pos, "`")

        if prefix_role_start is None:
            prefix_role_start = self._pos

        token_start = prefix_role_start
        self._pos += 1
        content_start = self._pos

        # TODO: are these correct bounds?
        prev_char = self._ch_at(token_start - 1)
        next_char = self._ch_at(token_start + 1)
        if not _is_start_string(prev_char, next_char):
            self._pos = content_start + 1
            return

        while self._fits(self._pos):
            if self._ch_eq(self._pos, "`"):
                content_end = self._pos
                self._pos += 1
                if self._ch_eq(self._pos, "_"):
                    n_underscores = self._eat("_")
                    suffix_role = None
                elif self._ch_eq(self._pos, ":"):
                    suffix_role = self._scan_for_explicit_role()
                    n_underscores = 0
                else:
                    suffix_role = None
                    n_underscores = 0
                token_end = self._pos

                # TODO: are these correct bounds?
                prev_char = self._ch_at(content_end - 1)
                next_char = self._ch_at(token_end)
                if not _is_end_string(prev_char, next_char):
                    self._pos = content_end + 1
                    continue

                if content_start == content_end:
                    # Empty content is not allowed.
                    break

                if n_underscores > 2:
                    # Too many underscores.
                    break

                if bool(n_underscores) + bool(prefix_role) + bool(suffix_role) > 1:
                    # Malformed interpreted text, just skip it as-is.
                    return

                if n_underscores:
                    target, title = yuio.doc._process_link(
                        self._text[content_start:content_end],
                    )
                    link = self._link_resolver.find_link(
                        title, target, is_anonymous=n_underscores == 2
                    )
                    if link and link.type == "link":
                        target = link.content
                    else:
                        target = None
                    token = self._emit(
                        token_start, content_start, content_end, token_end, "link"
                    )
                    token.data["url"] = target
                    token.data["title"] = title
                else:
                    token = self._emit(
                        token_start, content_start, content_end, token_end, "formatted"
                    )
                    token.data["content"] = yuio.doc._process_role(
                        self._text[content_start:content_end],
                        prefix_role or suffix_role or "literal",
                    )
                return
            elif self._ch_eq(self._pos, "\\"):
                self._pos += 2
            else:
                self._pos += 1

        self._pos = content_start + 1

    def _parse_prefixed_interpreted_text(self):
        assert self._ch_eq(self._pos, ":")

        token_start = self._pos
        role = self._scan_for_explicit_role()
        if role and self._ch_eq(self._pos, "`"):
            self._parse_interpreted_text(role, token_start)
        else:
            self._pos = token_start + 1

    def _parse_emphasis(self):
        assert self._ch_eq(self._pos, "*")

        token_start = self._pos
        self._pos += 1
        content_start = self._pos

        prev_char = self._ch_at(token_start - 1)
        next_char = self._ch_at(content_start)
        if not _is_start_string(prev_char, next_char):
            self._pos = content_start + 1
            return

        while self._fits(self._pos):
            if self._ch_eq(self._pos, "*"):
                content_end = self._pos
                self._pos += 1
                token_end = self._pos

                prev_char = self._ch_at(content_end - 1)
                next_char = self._ch_at(token_end)
                if not _is_end_string(prev_char, next_char):
                    self._pos = content_end + 1
                    continue

                if content_start == content_end:
                    # Empty content is not allowed.
                    break

                self._emit(token_start, content_start, content_end, token_end, "em")
                return
            elif self._ch_eq(self._pos, "\\"):
                self._pos += 2
            else:
                self._pos += 1

        self._pos = content_start + 1

    def _parse_strong(self):
        assert self._ch_eq(self._pos, "*")
        assert self._ch_eq(self._pos + 1, "*")

        token_start = self._pos
        self._pos += 2
        content_start = self._pos

        prev_char = self._ch_at(token_start - 1)
        next_char = self._ch_at(content_start)
        if not _is_start_string(prev_char, next_char):
            self._pos = content_start + 1
            return

        while self._fits(self._pos):
            if self._ch_eq(self._pos, "*") and self._ch_eq(self._pos + 1, "*"):
                content_end = self._pos
                self._pos += 2
                token_end = self._pos

                prev_char = self._ch_at(content_end - 1)
                next_char = self._ch_at(token_end)
                if not _is_end_string(prev_char, next_char):
                    self._pos = content_end + 1
                    continue

                if content_start == content_end:
                    # Empty content is not allowed.
                    break

                self._emit(token_start, content_start, content_end, token_end, "strong")
                return
            elif self._ch_eq(self._pos, "\\"):
                self._pos += 2
            else:
                self._pos += 1

        self._pos = content_start + 1

    def _parse_substitution(self):
        assert self._ch_eq(self._pos, "|")

        token_start = self._pos
        self._pos += 1
        content_start = self._pos

        prev_char = self._ch_at(token_start - 1)
        next_char = self._ch_at(content_start)
        if not _is_start_string(prev_char, next_char):
            self._pos = content_start + 1
            return

        while self._fits(self._pos):
            if self._ch_eq(self._pos, "|"):
                content_end = self._pos
                self._pos += 1
                token_end = self._pos

                prev_char = self._ch_at(content_end - 1)
                next_char = self._ch_at(token_end)
                if not _is_end_string(prev_char, next_char):
                    self._pos = content_end + 1
                    continue

                if content_start == content_end:
                    # Empty content is not allowed.
                    break

                # TODO: actually substitute things.
                self._emit(token_start, content_start, content_end, token_end, "text")
                return
            elif self._ch_eq(self._pos, "\\"):
                self._pos += 2
            else:
                self._pos += 1

        self._pos = content_start + 1

    def _parse_inline_internal_target(self):
        assert self._ch_eq(self._pos, "_")
        assert self._ch_eq(self._pos + 1, "`")

        token_start = self._pos
        self._pos += 2
        content_start = self._pos

        prev_char = self._ch_at(token_start - 1)
        next_char = self._ch_at(content_start)
        if not _is_start_string(prev_char, next_char):
            self._pos = content_start + 1
            return

        while self._fits(self._pos):
            if self._ch_eq(self._pos, "`"):
                content_end = self._pos
                self._pos += 1
                token_end = self._pos

                prev_char = self._ch_at(content_end - 1)
                next_char = self._ch_at(token_end)
                if not _is_end_string(prev_char, next_char):
                    self._pos = content_end + 1
                    continue

                if content_start == content_end:
                    # Empty content is not allowed.
                    break

                self._emit(token_start, content_start, content_end, token_end, "text")
                return
            elif self._ch_eq(self._pos, "\\"):
                self._pos += 2
            else:
                self._pos += 1

        self._pos = content_start + 1

    def _parse_footnote_reference(self):
        assert self._ch_eq(self._pos, "[")

        token_start = self._pos
        self._pos += 1
        content_start = self._pos

        prev_char = self._ch_at(token_start - 1)
        next_char = self._ch_at(content_start)
        if not _is_start_string(prev_char, next_char):
            self._pos = content_start + 1
            return

        while self._fits(self._pos):
            if self._ch_eq(self._pos, "]") and self._ch_eq(self._pos + 1, "_"):
                content_end = self._pos
                self._pos += 2
                token_end = self._pos

                prev_char = self._ch_at(content_end - 1)
                next_char = self._ch_at(token_end)
                if not _is_end_string(prev_char, next_char):
                    self._pos = content_end + 1
                    continue

                if content_start == content_end:
                    # Empty content is not allowed.
                    break

                target = self._link_resolver.find_link(
                    self._text[content_start:content_end],
                    None,
                    is_anonymous=False,
                )
                if target and target.type == "footnote":
                    content = target.content
                else:
                    content = None
                token = self._emit(
                    token_start, content_start, content_end, token_end, "footnote"
                )
                token.data["content"] = content
                return
            elif self._ch_eq(self._pos, "\\"):
                self._pos += 2
            else:
                self._pos += 1

        self._pos = content_start + 1
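
    # Unquoted links like ``name_`` are detected at their trailing
    # underscore(s); having seen the underscores, we scan backwards over the
    # reference name to find where the link starts.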
    def _parse_unquoted_link(self):
        content_end = self._pos
        n_underscores = self._eat("_")
        token_end = self._pos

        assert n_underscores > 0

        if n_underscores > 2:
            return

        prev_char = self._ch_at(content_end - 1)
        next_char = self._ch_at(token_end)
        if not _is_end_string(prev_char, next_char):
            return

        # Can be a link without backticks. Scan back to find its start.
        content_start = content_end
        while content_start - 1 >= self._start:
            match self._text[content_start - 1]:
                case ch if ch.isalnum():
                    content_start -= 1
                case ch if ch in "-_+:," and not self._ch_in(
                    content_start - 2, "-_+:,"
                ):
                    # Isolated punctuation.
                    content_start -= 1
                case _:
                    break

        # Start string is empty as per RST spec.
        token_start = content_start

        prev_char = self._ch_at(token_start - 1)
        next_char = self._ch_at(content_start)
        if not _is_start_string(prev_char, next_char):
            return

        if content_start == content_end:
            return

        title = self._text[content_start:content_end]
        target = self._link_resolver.find_link(
            title,
            None,
            is_anonymous=n_underscores == 2,
        )
        if target and target.type == "link":
            url = target.content
        else:
            url = None
        token = self._emit(token_start, content_start, content_end, token_end, "link")
        token.data["url"] = url
        token.data["title"] = title


def parse(text: str, /, *, dedent: bool = True) -> yuio.doc.Document:
    """
    Parse a ReStructuredText document and return an AST node.

    :param text:
        text to parse. If `dedent` is set (the default), common indentation
        will be removed from this string, making it suitable to use with
        triple-quoted literals.
    :param dedent:
        remove common leading indentation from `text`.
    :returns:
        parsed AST node.

    """

    if dedent:
        text = _dedent(text)

    return RstParser().parse(text)