Coverage for yuio / md.py: 90%

725 statements  

coverage.py v7.13.3, created at 2026-02-03 15:42 +0000

1# Yuio project, MIT license. 

2# 

3# https://github.com/taminomara/yuio/ 

4# 

5# You're free to copy this file to your project and edit it for your needs, 

6# just keep this copyright line please :3 

7 

8""" 

9Parser for Markdown/MyST. 

10 

11Yuio supports all CommonMark features except tables. It also supports directives 

12and interpreted text via MyST_ syntax. 

13 

14**Supported block markup:** 

15 

16- headings, 

17- numbered and bullet lists, 

18- code blocks using backticks and indentation, 

19- MyST-style code blocks using colons, 

20- code blocks containing MyST directives, 

21- quotes, 

22- hyperlink targets, 

23- thematic breaks. 

24 

25**Supported directives:** 

26 

27- code: 

28 ``code-block``, 

29 ``sourcecode``, 

30 ``code``; 

31- admonitions: 

32 ``attention``, 

33 ``caution``, 

34 ``danger``, 

35 ``error``, 

36 ``hint``, 

37 ``important``, 

38 ``note``, 

39 ``seealso``, 

40 ``tip``, 

41 ``warning``; 

42- versioning: 

43 ``versionadded``, 

44 ``versionchanged``, 

45 ``deprecated``; 

46- any other directive is rendered as un-highlighted code. 

47 

48**Supported inline syntax:** 

49 

50- emphasis (``*em*``), 

51- strong emphasis (``**strong**``), 

52- inline code in backticks (```code```), 

53- inline math (``$math$``), 

54- MyST-style interpreted text (``{role}`content```), 

55- hyperlinks (``[text](link)``, ``[text][anchor]``, ``[anchor]``) 

56 in terminals that can render them, 

57- backslash-escaping. 

58 

59**Supported inline roles:** 

60 

61- ``flag`` for CLI flags, 

62- any other role is interpreted as a documentation reference with explicit titles 

63 (``{py:class}`title <mod.Class>```) and shortening paths via tilde 

64 (``{py:class}`~mod.Class```). 

65 

66.. _MyST: https://myst-parser.readthedocs.io/ 

67 

68.. autofunction:: parse 

69 

70.. autoclass:: MdParser 

71 :members: 

72 

73""" 

74 

75from __future__ import annotations 

76 

77import dataclasses 

78import re 

79import string 

80from dataclasses import dataclass 

81 

82import yuio.doc 

83from yuio.util import dedent as _dedent 

84 

85from typing import TYPE_CHECKING 

86 

87if TYPE_CHECKING: 

88 import typing_extensions as _t 

89else: 

90 from yuio import _typing as _t 

91 

92__all__ = [ 

93 "MdParser", 

94 "parse", 

95] 

96 

97 

98T = _t.TypeVar("T") 

99 

100 

101_HEADING_RE = re.compile( 

102 r""" 

103 ^ 

104 \s{0,3} # - Initial indent. 

105 (?P<marker>\#{1,6}) # - Heading marker. 

106 (?P<text>(?:\s.*?)?) # - Heading text. Unless empty, text must be separated 

107 # from the heading marker by a space. 

108 (?:(?<=\s)\#+)? # - Optional closing hashes. Must be separated from 

109 # the previous content by a space. We use lookbehind 

110 # here, because if the text is empty, the space 

111 # between heading marker and closing hashes will be 

112 # matched by the `text` group. 

113 \s* # - Closing spaces. 

114 $ 

115 """, 

116 re.VERBOSE, 

117) 
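As an illustration (an editorial sketch, not part of the original file), the ATX-heading pattern above splits a heading line like this:

    m = _HEADING_RE.match("  ## Usage ##")
    assert m is not None
    assert m.group("marker") == "##"           # marker length gives the level
    assert m.group("text").strip() == "Usage"  # closing hashes are not captured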

118_SETEXT_HEADING_RE = re.compile( 

119 r""" 

120 ^ 

121 (?P<indent>\s{0,3}) # - Initial indent. 

122 (?P<level>-|=) # - Heading underline. 

123 \2* # - More heading underline. 

124 \s* # - Closing spaces. 

125 $ 

126 """, 

127 re.VERBOSE, 

128) 

129_LIST_RE = re.compile( 

130 r""" 

131 ^ 

132 (?P<marker> 

133 \s{0,3} # - Initial indent. 

134 (?P<type>[-*+]) # - List marker. 

135 (?: 

136 \s(?:\s{0,3}(?=\S))? # - One mandatory and up to three optional spaces; 

137 # When there are more than three optional spaces, 

138 # we treat them as a list marker followed 

139 # by a single space, followed by a code block. 

140 | $)) # - For cases when a list starts with an empty line. 

141 (?P<text>.*) # - Text of the first line in the list. 

142 $ 

143 """, 

144 re.VERBOSE, 

145) 

146_NUMBERED_LIST_RE = re.compile( 

147 r""" 

148 ^ 

149 (?P<marker> 

150 \s{0,3} # - Initial indent. 

151 (?P<number>\d{1,9}) # - Number. 

152 (?P<type>[.:)]) # - Numbered list marker. 

153 (?: 

154 \s(?:\s{0,3}(?=\S))? # - One mandatory and up to three optional spaces; 

155 # When there are more than three optional spaces, 

156 # we treat them as a list marker followed 

157 # by a single space, followed by a code block. 

158 | $)) # - For cases when a list starts with an empty line. 

159 (?P<text>.*) # - Text of the first line in the list. 

160 $ 

161 """, 

162 re.VERBOSE, 

163) 
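As an illustration (editorial sketch), the numbered-list pattern captures the marker, number, marker type, and first-line text separately:

    m = _NUMBERED_LIST_RE.match("1. first item")
    assert m is not None
    assert m.group("marker") == "1. "
    assert m.group("number") == "1"
    assert m.group("type") == "."
    assert m.group("text") == "first item"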

164_CODE_BACKTICK_RE = re.compile( 

165 r""" 

166 ^ 

167 (?P<indent>\s{0,3}) # - Initial indent. 

168 (?P<fence>```+) # - Backtick fence. 

169 (?P<syntax>[^`]*) # - Syntax, can't contain backtick. 

170 $ 

171 """, 

172 re.VERBOSE, 

173) 

174_CODE_TILDE_RE = re.compile( 

175 r""" 

176 ^ 

177 (?P<indent>\s{0,3}) # - Initial indent. 

178 (?P<fence>~~~+|:::+) # - Tilde or colon fence. 

179 (?P<syntax>.*) # - Syntax, can be anything. 

180 $ 

181 """, 

182 re.VERBOSE, 

183) 

184_CODE_FENCE_END_RE = re.compile( 

185 r""" 

186 ^ 

187 (?P<indent>\s{0,3}) # - Initial indent. 

188 (?P<fence>~~~+|```+|:::+) # - Fence. 

189 \s* # - Closing spaces. 

190 $ 

191 """, 

192 re.VERBOSE, 

193) 

194_CODE_RE = re.compile( 

195 r""" 

196 ^ 

197 \s{4} # - Initial code indent. 

198 (?P<text>.*) # - First code line. 

199 $ 

200 """, 

201 re.VERBOSE, 

202) 

203_QUOTE_RE = re.compile( 

204 r""" 

205 ^ 

206 (?P<indent>\s{0,3}) # - Initial quote indent. 

207 > # - Quote marker. 

208 \s? # - Optional space after the marker. 

209 (?P<text>.*) # - Text of the first line in the quote. 

210 $ 

211 """, 

212 re.VERBOSE, 

213) 

214_THEMATIC_BREAK_RE = re.compile( 

215 r""" 

216 ^ 

217 (?P<indent>\s{0,3}) # - Initial indent. 

218 ([-*_])\s*(\2\s*){2,} # - At least three break characters separated by spaces. 

219 $ 

220 """, 

221 re.VERBOSE, 

222) 

223_LINK_ANCHOR_RE = re.compile( 

224 r""" 

225 ^ 

226 (?P<indent>\s{0,3}) # - Initial indent. 

227 \[ # - Opening marker. 

228 (?P<anchor> 

229 (?:[^\[\]]|\\.){1,999} # - Link anchor, up to 999 symbols. 

230 ) 

231 \]: # - Closing marker. 

232 (?P<href>.*) # - Url. If empty, we look for url on the next line. 

233 $ 

234 """, 

235 re.VERBOSE, 

236) 
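As an illustration (editorial sketch), a hyperlink target definition is split into anchor and href:

    m = _LINK_ANCHOR_RE.match("[MyST]: https://myst-parser.readthedocs.io/")
    assert m is not None
    assert m.group("anchor") == "MyST"
    assert m.group("href").strip() == "https://myst-parser.readthedocs.io/"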

237_MYST_DIRECTIVE_NAME_RE = re.compile( 

238 r""" 

239 ^ 

240 \{ # - Directive name starts with an opening brace. 

241 (?P<directive_name>(?: # - The actual name consists of: 

242 [a-zA-Z0-9] # - alphanumerics, 

243 | [-_+:,](?![-_+:,]) # - or isolated special characters, 

244 )+) # - and it's non-empty. 

245 \} # - It ends with a closing brace. 

246 (?P<arg>.*) # - Followed by directive arguments. 

247 $ 

248 """, 

249 re.VERBOSE, 

250) 
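As an illustration (editorial sketch), the syntax string of a fenced block is recognised as a MyST directive when it matches this pattern:

    m = _MYST_DIRECTIVE_NAME_RE.match("{code-block} python")
    assert m is not None
    assert m.group("directive_name") == "code-block"
    assert m.group("arg").strip() == "python"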

251_LINE_FEED_RE = re.compile(r"\r\n|\r|\n|\v\r\n|\v\r|\v\n|\v") 

252 

253 

254@dataclass(slots=True) 

255class _Token: 

256 """ 

257 Token for processing inline markup. 

258 

259 """ 

260 

261 start: int 

262 end: int 

263 kind: str 

264 

265 # Length can decrease as we use up emphasis symbols. 

266 len: int = dataclasses.field(init=False) 

267 

268 # Emphasis data. 

269 can_open: bool = False 

270 can_close: bool = False 

271 prev_delim: int = -1 

272 next_delim: int = -1 

273 

274 # Action data. 

275 _data: dict[str, _t.Any] | None = dataclasses.field(init=False, default=None) 

276 

277 def __post_init__(self): 

278 self.len = self.end - self.start 

279 

280 @property 

281 def data(self): 

282 if self._data is None: 

283 self._data = {} 

284 return self._data 

285 

286 

287@dataclass(kw_only=True, slots=True) 

288class _Default: 

289 pass 

290 

291 

292@dataclass(kw_only=True, slots=True) 

293class _List: 

294 type: str 

295 marker_len: int 

296 list: yuio.doc.List 

297 parser: MdParser 

298 number: int | None = None 

299 starts_with_empty_line: bool = False 

300 

301 

302@dataclass(kw_only=True, slots=True) 

303class _Quote: 

304 parser: MdParser 

305 

306 

307@dataclass(kw_only=True, slots=True) 

308class _Code: 

309 lines: list[str] 

310 

311 

312@dataclass(kw_only=True, slots=True) 

313class _FencedCode: 

314 indent: int 

315 fence_symbol: str 

316 fence_length: int 

317 syntax: str 

318 lines: list[str] 

319 

320 

321@dataclass(kw_only=True, slots=True) 

322class _Paragraph: 

323 lines: list[str] 

324 

325 

326@dataclass(kw_only=True, slots=True) 

327class _Anchor: 

328 anchor: str 

329 

330 

331_State: _t.TypeAlias = ( 

332 _Default | _List | _Quote | _Code | _FencedCode | _Paragraph | _Anchor 

333) 

334 

335 

336@_t.final 

337class MdParser(yuio.doc.DocParser): 

338 """ 

339 Parses a subset of CommonMark/MyST. 

340 

341 """ 

342 

343 def __init__(self): 

344 self._nodes: list[yuio.doc.AstBase] = [] 

345 self._state: _State = _Default() 

346 self._anchors: dict[str, tuple[str, str]] = {} 

347 

348 def _parser(self) -> MdParser: 

349 parser = MdParser() 

350 parser._anchors = self._anchors 

351 return parser 

352 

353 @staticmethod 

354 def _is_blank(s: str) -> bool: 

355 return not s or s.isspace() 

356 

357 def parse(self, s: str) -> yuio.doc.Document: 

358 s = s.expandtabs(tabsize=4) 

359 root = self._do_parse(_LINE_FEED_RE.split(s)) 

360 yuio.doc._clean_tree(root) 

361 self._process_inline_text(root) 

362 return root 

363 

364 def parse_paragraph(self, s: str, /) -> list[str | yuio.doc.TextRegion]: 

365 return _InlineParser(s, {}).run() 

366 

367 def _do_parse(self, lines: list[str]): 

368 for line in lines: 

369 self._handle_line(line) 

370 return yuio.doc.Document(items=self._finalize()) 

371 

372 def _process_inline_text(self, node: yuio.doc.AstBase): 

373 if isinstance(node, yuio.doc.Admonition): 

374 processor = _InlineParser("\n".join(map(str, node.title)), self._anchors) 

375 node.title = processor.run() 

376 if isinstance(node, yuio.doc.Text): 

377 processor = _InlineParser("\n".join(map(str, node.items)), self._anchors) 

378 node.items = processor.run() 

379 elif isinstance(node, yuio.doc.Container): 

380 for item in node.items: 

381 self._process_inline_text(item) 

382 

383 def _handle_line(self, line: str): 

384 getattr(self, f"_handle_line_{self._state.__class__.__name__.lstrip('_')}")( 

385 line 

386 ) 

387 

388 def _handle_lazy_line(self, line: str) -> bool: 

389 return getattr( 

390 self, f"_handle_lazy_line_{self._state.__class__.__name__.lstrip('_')}" 

391 )(line) 

392 

393 def _flush(self): 

394 getattr(self, f"_flush_{self._state.__class__.__name__.lstrip('_')}")() 

395 
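The three dispatchers above route each call to a per-state method derived from the state's class name. A self-contained sketch of the same pattern (the `_Demo*` names are hypothetical, introduced here only for illustration):

    from dataclasses import dataclass, field

    @dataclass
    class _DemoParagraph:
        lines: list = field(default_factory=list)

    class _DemoParser:
        def __init__(self):
            self._state = _DemoParagraph()

        def _handle_line(self, line: str):
            # With a _DemoParagraph state, this resolves to
            # _handle_line_DemoParagraph, mirroring the getattr calls above.
            name = self._state.__class__.__name__.lstrip("_")
            getattr(self, f"_handle_line_{name}")(line)

        def _handle_line_DemoParagraph(self, line: str):
            self._state.lines.append(line)

    p = _DemoParser()
    p._handle_line("hello")
    assert p._state.lines == ["hello"]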

396 def _handle_line_List(self, line: str): 

397 assert type(self._state) is _List 

398 if self._is_blank(line) and self._state.starts_with_empty_line: 

399 self._flush_List() 

400 self._handle_line_Default(line) 

401 elif self._is_blank(line) or line[: self._state.marker_len].isspace(): 

402 self._state.parser._handle_line(line[self._state.marker_len :]) 

403 elif ( 

404 ( 

405 (match := _LIST_RE.match(line)) 

406 or (match := _NUMBERED_LIST_RE.match(line)) 

407 ) 

408 and match.group("type") == self._state.type 

409 and not _THEMATIC_BREAK_RE.match(line) 

410 ): 

411 item = yuio.doc.ListItem( 

412 items=self._state.parser._finalize(), 

413 number=self._state.number, 

414 ) 

415 self._state.list.items.append(item) 

416 marker = match.group("marker") 

417 indent = len(marker) 

418 if not marker.endswith(" "): 

419 indent += 1 

420 self._state.marker_len = indent 

421 self._state.parser._handle_line(match.group("text")) 

422 if self._state.number is not None: 

423 self._state.number += 1 

424 elif not self._state.parser._handle_lazy_line(line): 

425 self._flush_List() 

426 self._handle_line_Default(line) 

427 

428 def _handle_lazy_line_List(self, line: str) -> bool: 

429 assert type(self._state) is _List 

430 if self._state.parser._handle_lazy_line(line): 

431 return True 

432 return False 

433 

434 def _flush_List(self): 

435 assert type(self._state) is _List 

436 item = yuio.doc.ListItem( 

437 items=self._state.parser._finalize(), 

438 number=self._state.number, 

439 ) 

440 self._state.list.items.append(item) 

441 self._nodes.append(self._state.list) 

442 self._state = _Default() 

443 

444 def _handle_line_Quote(self, line: str): 

445 assert type(self._state) is _Quote 

446 if match := _QUOTE_RE.match(line): 

447 self._state.parser._handle_line(match.group("text")) 

448 elif self._is_blank(line) or not self._state.parser._handle_lazy_line(line): 

449 self._flush_Quote() 

450 self._handle_line_Default(line) 

451 

452 def _handle_lazy_line_Quote(self, line: str) -> bool: 

453 assert type(self._state) is _Quote 

454 if self._state.parser._handle_lazy_line(line): 

455 return True 

456 else: 

457 return False 

458 

459 def _flush_Quote(self): 

460 assert type(self._state) is _Quote 

461 self._nodes.append(yuio.doc.Quote(items=self._state.parser._finalize())) 

462 self._state = _Default() 

463 

464 def _handle_line_Code(self, line: str): 

465 assert type(self._state) is _Code 

466 if self._is_blank(line) or line.startswith("    "): 

467 self._state.lines.append(line[4:]) 

468 else: 

469 self._flush_Code() 

470 self._handle_line_Default(line) 

471 

472 def _handle_lazy_line_Code(self, line: str) -> bool: 

473 assert type(self._state) is _Code 

474 return False # No lazy continuations for code! 

475 

476 def _flush_Code(self): 

477 assert type(self._state) is _Code 

478 while self._state.lines and self._is_blank(self._state.lines[-1]): 

479 self._state.lines.pop() 

480 if self._state.lines: 

481 self._nodes.append( 

482 yuio.doc.Code( 

483 lines=self._state.lines, 

484 syntax="", 

485 ) 

486 ) 

487 self._state = _Default() 

488 

489 def _handle_line_FencedCode(self, line: str): 

490 assert type(self._state) is _FencedCode 

491 if ( 

492 (match := _CODE_FENCE_END_RE.match(line)) 

493 and match.group("fence")[0] == self._state.fence_symbol 

494 and len(match.group("fence")) >= self._state.fence_length 

495 ): 

496 self._flush_FencedCode() 

497 else: 

498 if self._state.indent == 0: 

499 pass 

500 elif line[: self._state.indent].isspace(): 

501 line = line[self._state.indent :] 

502 else: 

503 line = line.lstrip() 

504 self._state.lines.append(line) 

505 

506 def _handle_lazy_line_FencedCode(self, line: str) -> bool: 

507 assert type(self._state) is _FencedCode 

508 return False 

509 

510 def _flush_FencedCode(self): 

511 assert type(self._state) is _FencedCode 

512 if match := _MYST_DIRECTIVE_NAME_RE.match(self._state.syntax): 

513 # This is a MyST directive. 

514 first_actual_line = 0 

515 

516 # Parse yaml options block. 

517 if ( 

518 first_actual_line < len(self._state.lines) 

519 and self._state.lines[first_actual_line] == "---" 

520 ): 

521 first_actual_line += 1 

522 while ( 

523 first_actual_line < len(self._state.lines) 

524 and self._state.lines[first_actual_line] != "---" 

525 ): 

526 first_actual_line += 1 

527 # Parse normal options block. 

528 if first_actual_line < len(self._state.lines) and self._state.lines[ 

529 first_actual_line 

530 ].startswith(":"): 

531 first_actual_line += 1 

532 # Trim empty lines. 

533 if ( 

534 first_actual_line < len(self._state.lines) 

535 and not self._state.lines[first_actual_line].strip() 

536 ): 

537 first_actual_line += 1 

538 self._state.lines = self._state.lines[first_actual_line:] 

539 

540 name = match.group("directive_name") 

541 arg = match.group("arg").strip() 

542 else: 

543 name = "code-block" 

544 arg = self._state.syntax 

545 

546 self._nodes.extend( 

547 yuio.doc._process_directive( 

548 name, 

549 arg, 

550 lambda: self._state.lines, # type: ignore 

551 lambda: self._parser()._do_parse(self._state.lines).items, # type: ignore 

552 ) 

553 ) 

554 self._state = _Default() 

555 

556 def _handle_line_Paragraph(self, line: str): 

557 assert type(self._state) is _Paragraph 

558 if match := _SETEXT_HEADING_RE.match(line): 

559 level = 1 if match.group("level") == "=" else 2 

560 self._nodes.append( 

561 yuio.doc.Heading( 

562 items=_t.cast(list[str | yuio.doc.TextRegion], self._state.lines), 

563 level=level, 

564 ) 

565 ) 

566 self._state = _Default() 

567 elif ( 

568 self._is_blank(line) 

569 or _THEMATIC_BREAK_RE.match(line) 

570 or _HEADING_RE.match(line) 

571 or _CODE_BACKTICK_RE.match(line) 

572 or _CODE_TILDE_RE.match(line) 

573 or ( 

574 (match := _LIST_RE.match(line)) 

575 and not self._is_blank(match.group("text")) 

576 ) 

577 or ( 

578 (match := _NUMBERED_LIST_RE.match(line)) 

579 and not self._is_blank(match.group("text")) 

580 and match.group("number") == "1" 

581 ) 

582 or _QUOTE_RE.match(line) 

583 ): 

584 self._flush_Paragraph() 

585 self._handle_line_Default(line) 

586 else: 

587 self._state.lines.append(line) 

588 

589 def _handle_lazy_line_Paragraph(self, line: str) -> bool: 

590 assert type(self._state) is _Paragraph 

591 if ( 

592 self._is_blank(line) 

593 or _THEMATIC_BREAK_RE.match(line) 

594 or _HEADING_RE.match(line) 

595 or _CODE_BACKTICK_RE.match(line) 

596 or _CODE_TILDE_RE.match(line) 

597 or _LIST_RE.match(line) 

598 or _NUMBERED_LIST_RE.match(line) 

599 or _QUOTE_RE.match(line) 

600 ): 

601 self._flush_Paragraph() 

602 return False 

603 else: 

604 self._state.lines.append(line) 

605 return True 

606 

607 def _flush_Paragraph(self): 

608 assert type(self._state) is _Paragraph 

609 self._nodes.append( 

610 yuio.doc.Paragraph( 

611 items=_t.cast(list[str | yuio.doc.TextRegion], self._state.lines) 

612 ) 

613 ) 

614 self._state = _Default() 

615 

616 def _handle_line_Anchor(self, line: str): 

617 assert type(self._state) is _Anchor 

618 line = line.strip() 

619 if line: 

620 url, _ = _InlineParser.parse_link(line) 

621 if url: 

622 self._anchors.setdefault(self._state.anchor, (line, "")) 

623 else: 

624 self._nodes.append(yuio.doc.Paragraph(items=[f"[{self._state.anchor}]:"])) 

625 self._state = _Default() 

626 

627 def _handle_lazy_line_Anchor(self, line: str): 

628 assert type(self._state) is _Anchor 

629 line = line.strip() 

630 if line: 

631 url, _ = _InlineParser.parse_link(line) 

632 if url: 

633 self._anchors.setdefault(self._state.anchor, (line, "")) 

634 self._state = _Default() 

635 return True 

636 else: 

637 self._nodes.append(yuio.doc.Paragraph(items=[f"[{self._state.anchor}]:"])) 

638 self._state = _Default() 

639 return False 

640 

641 def _flush_Anchor(self): 

642 assert type(self._state) is _Anchor 

643 self._state = _Default() 

644 

645 def _handle_line_Default(self, line: str): 

646 assert type(self._state) is _Default 

647 if self._is_blank(line): 

648 pass # do nothing 

649 elif match := _LINK_ANCHOR_RE.match(line): 

650 anchor = match.group("anchor").strip() 

651 href = match.group("href").strip() 

652 if not anchor: 

653 self._state = _Paragraph(lines=[line]) 

654 elif href: 

655 url, _ = _InlineParser.parse_link(href) 

656 if url is not None: 

657 anchor = _InlineParser.norm_anchor(anchor) 

658 self._anchors.setdefault(anchor, (url, "")) 

659 else: 

660 self._state = _Paragraph(lines=[line]) 

661 else: 

662 anchor = _InlineParser.norm_anchor(anchor) 

663 self._state = _Anchor(anchor=anchor) 

664 elif _THEMATIC_BREAK_RE.match(line): 

665 self._nodes.append(yuio.doc.ThematicBreak()) 

666 elif match := _HEADING_RE.match(line): 

667 level = len(match.group("marker")) 

668 self._nodes.append( 

669 yuio.doc.Heading( 

670 items=[match.group("text").strip()], 

671 level=level, 

672 ) 

673 ) 

674 elif (match := _CODE_BACKTICK_RE.match(line)) or ( 

675 match := _CODE_TILDE_RE.match(line) 

676 ): 

677 indent = len(match.group("indent")) 

678 syntax = match.group("syntax").strip() 

679 fence_symbol = match.group("fence")[0] 

680 fence_length = len(match.group("fence")) 

681 self._state = _FencedCode( 

682 indent=indent, 

683 fence_symbol=fence_symbol, 

684 fence_length=fence_length, 

685 syntax=syntax, 

686 lines=[], 

687 ) 

688 elif match := _CODE_RE.match(line): 

689 self._state = _Code(lines=[match.group("text")]) 

690 elif (match := _LIST_RE.match(line)) or ( 

691 match := _NUMBERED_LIST_RE.match(line) 

692 ): 

693 marker = match.group("marker") 

694 indent = len(marker) 

695 if not marker.endswith(" "): 

696 indent += 1 

697 list_type = match.group("type") 

698 number_str = match.groupdict().get("number", None) 

699 number = int(number_str) if number_str else None 

700 starts_with_empty_line = self._is_blank(match.group("text")) 

701 self._state = _List( 

702 type=list_type, 

703 marker_len=indent, 

704 list=yuio.doc.List( 

705 items=[], 

706 enumerator_kind=( 

707 yuio.doc.ListEnumeratorKind.NUMBER 

708 if number is not None 

709 else None 

710 ), 

711 ), 

712 parser=self._parser(), 

713 number=number, 

714 starts_with_empty_line=starts_with_empty_line, 

715 ) 

716 self._state.parser._handle_line(match.group("text")) 

717 elif match := _QUOTE_RE.match(line): 

718 self._state = _Quote(parser=self._parser()) 

719 self._state.parser._handle_line(match.group("text")) 

720 else: 

721 self._state = _Paragraph(lines=[line]) 

722 

723 def _handle_lazy_line_Default(self, line: str) -> bool: 

724 assert type(self._state) is _Default 

725 return False 

726 

727 def _flush_Default(self): 

728 assert type(self._state) is _Default 

729 

730 def _finalize(self) -> list[yuio.doc.AstBase]: 

731 self._flush() 

732 result = self._nodes 

733 self._nodes = [] 

734 return result 

735 

736 

737_UNESCAPE_RE = re.compile(rf"\\([{re.escape(string.punctuation)}])") 

738 

739 

740class _InlineParser: 

741 # Based on https://spec.commonmark.org/0.31.2/#phase-2-inline-structure 

742 

743 def __init__(self, text: str, anchors: dict[str, tuple[str, str]]) -> None: 

744 self._text = text 

745 self._pos = 0 

746 self._anchors = anchors 

747 self._tokens: list[_Token] = [] 

748 self._link_opener_indices: list[int] = [] 

749 self._delim_first = -1 

750 self._delim_last = -1 

751 

752 @staticmethod 

753 def norm_anchor(anchor: str) -> str: 

754 return re.sub(r"\s+", " ", anchor.strip()).casefold() 

755 

756 @staticmethod 

757 def unescape(text: str) -> str: 

758 return _UNESCAPE_RE.sub(r"\1", text) 

759 
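As an illustration (editorial sketch): anchors are whitespace-collapsed and case-folded, and backslash escapes before punctuation are removed:

    assert _InlineParser.norm_anchor("  My   Anchor ") == "my anchor"
    assert _InlineParser.unescape(r"\*not emphasis\*") == "*not emphasis*"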

760 def run(self) -> list[str | yuio.doc.TextRegion]: 

761 while self._fits(self._pos): 

762 self._run() 

763 self._process_delims() 

764 

765 res = yuio.doc.TextRegion() 

766 stack = [res] 

767 

768 em = 0 

769 strong = 0 

770 

771 def add_text(text: str | yuio.doc.TextRegion): 

772 if not text: 

773 return 

774 colors = [] 

775 if em: 

776 colors.append("em") 

777 if strong: 

778 colors.append("strong") 

779 if colors: 

780 text = yuio.doc.HighlightedRegion(text, color=" ".join(colors)) 

781 stack[-1].content.append(text) 

782 

783 for token in self._tokens: 

784 match token.kind: 

785 case "text": 

786 text = self._text[token.start : token.start + token.len] 

787 add_text(text) 

788 case "*" | "_": 

789 em += token.data.get("em", 0) 

790 strong += token.data.get("strong", 0) 

791 text = self._text[token.start : token.start + token.len] 

792 add_text(text) 

793 case "link_start": 

794 if (url := token.data.get("url")) is not None: 

795 stack.append(yuio.doc.LinkRegion(url=url)) 

796 else: 

797 text = self._text[token.start : token.start + token.len] 

798 add_text(text) 

799 case "link_end": 

800 assert len(stack) > 1 

801 top = stack.pop() 

802 stack[-1].content.append(top) 

803 case "escape": 

804 text = self._text[token.start : token.start + token.len] 

805 if text == "\n": 

806 text = ( 

807 "\v\n" # Vertical tab forces wrapper to make a line break. 

808 ) 

809 elif not text or text not in string.punctuation: 

810 text = "\\" + text 

811 add_text(text) 

812 case "formatted": 

813 add_text(token.data["content"]) 

814 case kind: 

815 assert False, kind 

816 

817 return res.content 

818 

819 @classmethod 

820 def parse_link(cls, link: str): 

821 return cls(link + ")", {})._parse_link() 

822 

823 def _fits(self, i): 

824 return i < len(self._text) 

825 

826 def _ch_eq(self, i, c): 

827 return self._fits(i) and self._text[i] == c 

828 

829 def _ch_in(self, i, cs): 

830 return self._fits(i) and self._text[i] in cs 

831 

832 def _ch_at(self, i): 

833 if 0 <= i < len(self._text): 

834 return self._text[i] 

835 else: 

836 return " " 

837 

838 def _eat(self, ch): 

839 start = self._pos 

840 while self._pos < len(self._text) and self._text[self._pos] == ch: 

841 self._pos += 1 

842 return self._pos - start 

843 

844 def _eat_in(self, ch): 

845 start = self._pos 

846 while self._pos < len(self._text) and self._text[self._pos] in ch: 

847 self._pos += 1 

848 return self._pos - start 

849 

850 def _eat_not_in(self, ch): 

851 start = self._pos 

852 while self._pos < len(self._text) and self._text[self._pos] not in ch: 

853 self._pos += 1 

854 return self._pos - start 

855 

856 def _run(self): 

857 match self._text[self._pos]: 

858 case "\\": 

859 self._tokens.append(_Token(self._pos + 1, self._pos + 2, "escape")) 

860 self._pos += 2 

861 case "`": 

862 self._parse_code() 

863 case "$": 

864 self._parse_math() 

865 case "{": 

866 self._parse_role() 

867 case "!" if self._ch_eq(self._pos + 1, "["): 

868 self._push_link_start("image_start", 2) 

869 case "[": 

870 self._push_link_start("link_start", 1) 

871 case "]": 

872 self._parse_link_end() 

873 case "*" | "_": 

874 self._parse_delim_run() 

875 case "!" | "\\": 

876 self._tokens.append(_Token(self._pos, self._pos + 1, "text")) 

877 self._pos += 1 

878 case _: 

879 start = self._pos 

880 self._eat_not_in("\\`[]!*_{$") 

881 self._tokens.append(_Token(start, self._pos, "text")) 

882 

883 def _parse_role(self): 

884 start = self._pos 

885 self._pos += 1 

886 # alphanumerics plus isolated internal hyphens, underscores, plus signs, colons, and commas 

887 

888 while self._fits(self._pos): 

889 match self._text[self._pos]: 

890 case "}": 

891 self._pos += 1 

892 break 

893 case ch if ch.isalnum(): 

894 self._pos += 1 

895 case ch if ch in "-_+:," and not self._ch_in(self._pos + 1, "-_+:,"): 

896 self._pos += 1 

897 case _: 

898 self._pos = start + 1 

899 self._tokens.append(_Token(self._pos, self._pos + 1, "text")) 

900 return 

901 if self._ch_eq(self._pos, "`"): 

902 role = self._text[start + 1 : self._pos - 1] 

903 self._parse_code(role) 

904 

905 def _parse_code(self, role: str | None = None): 

906 start = self._pos 

907 n_backticks = self._eat("`") 

908 

909 end = None 

910 while self._fits(self._pos): 

911 if self._text[self._pos] == "`": 

912 n_backticks_end = self._eat("`") 

913 if n_backticks == n_backticks_end: 

914 end = self._pos 

915 break 

916 else: 

917 self._pos += 1 

918 

919 if end is None: 

920 self._tokens.append(_Token(start, start + n_backticks, "text")) 

921 self._pos = start + n_backticks 

922 else: 

923 code = self._text[start + n_backticks : end - n_backticks] 

924 if ( 

925 code.startswith((" ", "\n")) 

926 and code.endswith((" ", "\n")) 

927 and len(code) > 2 

928 ): 

929 code = code[1:-1] 

930 start += 1 

931 end -= 1 

932 token = _Token(start + n_backticks, end - n_backticks, "formatted") 

933 token.data["content"] = yuio.doc._process_role(code, role or "code") 

934 self._tokens.append(token) 

935 

936 def _parse_math(self): 

937 start = self._pos 

938 n_markers = self._eat("$") 

939 if n_markers > 2: 

940 self._tokens.append(_Token(start, self._pos, "text")) 

941 return 

942 

943 end = None 

944 while self._fits(self._pos): 

945 if self._text[self._pos] == "$": 

946 n_markers_end = self._eat("$") 

947 if n_markers == n_markers_end: 

948 end = self._pos 

949 break 

950 else: 

951 self._pos += 1 

952 

953 if end is None: 

954 self._tokens.append(_Token(start, start + n_markers, "text")) 

955 self._pos = start + n_markers 

956 else: 

957 code = self._text[start + n_markers : end - n_markers] 

958 token = _Token(start + n_markers, end - n_markers, "formatted") 

959 token.data["content"] = yuio.doc._process_role(code, "math") 

960 self._tokens.append(token) 

961 

962 def _push_link_start(self, kind, length): 

963 self._link_opener_indices.append(len(self._tokens)) 

964 self._tokens.append( 

965 _Token( 

966 self._pos, 

967 self._pos + length, 

968 kind, 

969 ) 

970 ) 

971 self._pos += length 

972 

973 def _parse_link_end(self): 

974 if not self._link_opener_indices: 

975 # No corresponding link opener. 

976 self._tokens.append(_Token(self._pos, self._pos + 1, "text")) 

977 self._pos += 1 

978 return 

979 opener_token_idx = self._link_opener_indices.pop() 

980 opener_token = self._tokens[opener_token_idx] 

981 assert opener_token.kind in ["link_start", "image_start"] 

982 

983 start = self._pos 

984 self._pos += 1 

985 

986 if self._ch_eq(self._pos, "("): 

987 self._pos += 1 

988 url, title = self._parse_link() 

989 else: 

990 if self._ch_eq(self._pos, "["): 

991 self._pos += 1 

992 anchor = self._parse_anchor() 

993 else: 

994 anchor = self._text[opener_token.end : self._pos - 1] 

995 if anchor: 

996 url, title = self._anchors.get(self.norm_anchor(anchor), (None, None)) 

997 else: 

998 url, title = None, None 

999 

1000 if url is None: 

1001 self._tokens.append(_Token(start, start + 1, "text")) 

1002 self._pos = start + 1 

1003 return 

1004 

1005 if opener_token.kind == "link_start": 

1006 close_token = _Token(start, self._pos, "link_end") 

1007 self._link_opener_indices.clear() # Prevent nested links. 

1008 else: 

1009 close_token = _Token(start, self._pos, "image_end") 

1010 opener_token.data["url"] = url 

1011 opener_token.data["title"] = title 

1012 opener_token.len = 0 

1013 close_token.data["url"] = None 

1014 close_token.data["title"] = None 

1015 close_token.len = 0 

1016 self._tokens.append(close_token) 

1017 self._process_delims(opener_token_idx) 

1018 

1019 def _parse_link(self): 

1020 if self._ch_eq(self._pos, "<"): 

1021 self._pos += 1 

1022 url = self._parse_href_angled() 

1023 else: 

1024 url = self._parse_href_bare() 

1025 if url is None: 

1026 return None, None # Href parsing failed. 

1027 if self._ch_in(self._pos, " )"): 

1028 title = self._parse_title() 

1029 if title is None: 

1030 return None, None # Title parsing failed. 

1031 else: 

1032 url = self.unescape(url) # Normal escaping rules apply. 

1033 return url, title 

1034 else: 

1035 return None, None # Href does not end with expected symbol. 

1036 

1037 def _parse_href_angled(self): 

1038 start = self._pos 

1039 while self._fits(self._pos): 

1040 match self._text[self._pos]: 

1041 case "\\" if self._ch_in(self._pos + 1, string.punctuation): 

1042 self._pos += 2 

1043 case ">": 

1044 self._pos += 1 

1045 return self._text[start : self._pos - 1] 

1046 case "<" | "\n": 

1047 break 

1048 case _: 

1049 self._pos += 1 

1050 return None 

1051 

1052 def _parse_href_bare(self): 

1053 start = self._pos 

1054 paren_level = 1 

1055 url = None 

1056 while self._fits(self._pos): 

1057 match self._text[self._pos]: 

1058 case "\\" if self._ch_in(self._pos + 1, string.punctuation): 

1059 self._pos += 2 

1060 case ch if 0x00 <= ord(ch) <= 0x1F: 

1061 break 

1062 case "\x7f": 

1063 break 

1064 case " ": 

1065 url = self._text[start : self._pos] 

1066 break 

1067 case "(": 

1068 paren_level += 1 

1069 self._pos += 1 

1070 case ")": 

1071 paren_level -= 1 

1072 if paren_level == 0: 

1073 url = self._text[start : self._pos] 

1074 break 

1075 else: 

1076 self._pos += 1 

1077 case _: 

1078 self._pos += 1 

1079 if not url: 

1080 # Empty url is not allowed in this case. 

1081 url = None 

1082 return url 

1083 

1084 def _parse_title(self): 

1085 self._eat(" ") 

1086 if self._ch_eq(self._pos, ")"): 

1087 self._pos += 1 

1088 return "" # Empty title is ok. 

1089 elif self._ch_eq(self._pos, "'"): 

1090 self._pos += 1 

1091 end_char = "'" 

1092 elif self._ch_eq(self._pos, '"'): 

1093 self._pos += 1 

1094 end_char = '"' 

1095 elif self._ch_eq(self._pos, "("): 

1096 self._pos += 1 

1097 end_char = ")" 

1098 else: 

1099 return None # Title parsing failed. 

1100 start = self._pos 

1101 title = None 

1102 while self._fits(self._pos): 

1103 match self._text[self._pos]: 

1104 case "\\" if self._ch_in(self._pos + 1, string.punctuation): 

1105 self._pos += 2 

1106 case ch if ch == end_char: 

1107 title = self._text[start : self._pos] 

1108 self._pos += 1 

1109 break 

1110 case _: 

1111 self._pos += 1 

1112 if self._ch_eq(self._pos, ")"): 

1113 self._pos += 1 

1114 else: 

1115 return None # Title is not followed by the expected closing paren. 

1116 return title 

1117 

1118 def _parse_anchor(self): 

1119 start = self._pos 

1120 while self._fits(self._pos): 

1121 match self._text[self._pos]: 

1122 case "\\" if self._ch_in(self._pos + 1, string.punctuation): 

1123 self._pos += 2 

1124 case "]": 

1125 self._pos += 1 

1126 return self._text[start : self._pos - 1] 

1127 case _: 

1128 self._pos += 1 

1129 return None 

1130 

1131 def _parse_delim_run(self): 

1132 start = self._pos 

1133 ch = self._text[self._pos] 

1134 self._eat(ch) 

1135 

1136 char_before = self._ch_at(start - 1) 

1137 char_after = self._ch_at(self._pos) 

1138 

1139 left_flanking = not char_after.isspace() and ( 

1140 char_after not in string.punctuation 

1141 or char_before.isspace() 

1142 or char_before in string.punctuation 

1143 ) 

1144 

1145 right_flanking = not char_before.isspace() and ( 

1146 char_before not in string.punctuation 

1147 or char_after.isspace() 

1148 or char_after in string.punctuation 

1149 ) 

1150 

1151 if ch == "*": 

1152 can_open = left_flanking 

1153 can_close = right_flanking 

1154 else: # "_" 

1155 can_open = left_flanking and ( 

1156 not right_flanking or (char_before in string.punctuation) 

1157 ) 

1158 can_close = right_flanking and ( 

1159 not left_flanking or (char_after in string.punctuation) 

1160 ) 

1161 

1162 if can_open or can_close: 

1163 self._tokens.append( 

1164 _Token(start, self._pos, ch, can_open=can_open, can_close=can_close) 

1165 ) 

1166 self._push_delim(-1) 

1167 else: 

1168 self._tokens.append(_Token(start, self._pos, "text")) 

1169 

1170 def _push_delim(self, idx: int): 

1171 if idx == -1: 

1172 idx += len(self._tokens) 

1173 assert idx >= 0 

1174 assert self._tokens[idx].kind in "*_" 

1175 assert self._tokens[idx].prev_delim == -1 

1176 assert self._tokens[idx].next_delim == -1 

1177 

1178 if self._delim_last == -1: 

1179 self._delim_last = self._delim_first = idx 

1180 else: 

1181 self._tokens[self._delim_last].next_delim = idx 

1182 self._tokens[idx].prev_delim = self._delim_last 

1183 self._delim_last = idx 

1184 

1185 def _remove_delim(self, idx: int): 

1186 tok = self._tokens[idx] 

1187 if tok.prev_delim == -1: 

1188 self._delim_first = tok.next_delim 

1189 else: 

1190 self._tokens[tok.prev_delim].next_delim = tok.next_delim 

1191 if tok.next_delim == -1: 

1192 self._delim_last = tok.prev_delim 

1193 else: 

1194 self._tokens[tok.next_delim].prev_delim = tok.prev_delim 

1195 

1196 def _next_delim(self, idx: int): 

1197 if idx == -1: 

1198 return self._delim_first 

1199 else: 

1200 return self._tokens[idx].next_delim 

1201 

1202 def _prev_delim(self, idx: int): 

1203 if idx == -1: 

1204 return self._delim_last 

1205 else: 

1206 return self._tokens[idx].prev_delim 

1207 

1208 def _process_delims(self, first_delim: int = -1): 

1209 if first_delim == -1: 

1210 bottom_idx = -1 

1211 else: 

1212 for i in range(first_delim, len(self._tokens)): 

1213 if self._tokens[i].kind in "*_": 

1214 bottom_idx = self._prev_delim(i) 

1215 break 

1216 else: 

1217 bottom_idx = -1 

1218 

1219 openers_bottom_idxs = { 

1220 ("*", 0, False): bottom_idx, 

1221 ("*", 1, False): bottom_idx, 

1222 ("*", 2, False): bottom_idx, 

1223 ("*", 0, True): bottom_idx, 

1224 ("*", 1, True): bottom_idx, 

1225 ("*", 2, True): bottom_idx, 

1226 ("_", 0, False): bottom_idx, 

1227 ("_", 1, False): bottom_idx, 

1228 ("_", 2, False): bottom_idx, 

1229 ("_", 0, True): bottom_idx, 

1230 ("_", 1, True): bottom_idx, 

1231 ("_", 2, True): bottom_idx, 

1232 } 

1233 

1234 current_idx = self._next_delim(bottom_idx) 

1235 while True: 

1236 while current_idx != -1 and not self._tokens[current_idx].can_close: 

1237 current_idx = self._next_delim(current_idx) 

1238 if current_idx == -1: 

1239 break 

1240 # Current is a potential closer, find a matching opener for it. 

1241 current = self._tokens[current_idx] 

1242 bottom_idx_for_current = max( 

1243 bottom_idx, 

1244 openers_bottom_idxs[(current.kind, current.len % 3, current.can_open)], 

1245 ) 

1246 

1247 opener_idx = self._prev_delim(current_idx) 

1248 while opener_idx > bottom_idx_for_current: 

1249 opener = self._tokens[opener_idx] 

1250 

1251 # "If one of the delimiters can both open and close emphasis, 

1252 # then the sum of the lengths of the delimiter runs containing 

1253 # the opening and closing delimiters must not be a multiple 

1254 # of 3 unless both lengths are multiples of 3." 

1255 # 

1256 # See https://spec.commonmark.org/0.31.2/#emphasis-and-strong-emphasis. 

1257 if ( 

1258 opener.can_open 

1259 and opener.kind == current.kind 

1260 and ( 

1261 # None or the delimiters can open and close at the same time... 

1262 not (opener.can_close or current.can_open) 

1263 # ...or sum of their lengths is not a multiple of 3... 

1264 or (opener.len + current.len) % 3 != 0 

1265 # ...or both lengths are multiples of 3. 

1266 or not (opener.len % 3 != 0 or current.len % 3 != 0) 

1267 ) 

1268 ): 

1269 # Found an opener for current. 

1270 is_strong = opener.len >= 2 and current.len >= 2 

1271 

1272 data_key = "strong" if is_strong else "em" 

1273 opener.data.setdefault(data_key, 0) 

1274 opener.data[data_key] += 1 

1275 current.data.setdefault(data_key, 0) 

1276 current.data[data_key] -= 1 

1277 

1278 opener.next_delim = current_idx 

1279 current.prev_delim = opener_idx 

1280 

1281 opener.len -= 1 + is_strong 

1282 if not opener.len: 

1283 self._remove_delim(opener_idx) 

1284 

1285 current.len -= 1 + is_strong 

1286 next_idx = current_idx 

1287 if not current.len: 

1288 next_idx = self._next_delim(current_idx) 

1289 self._remove_delim(current_idx) 

1290 

1291 current_idx = next_idx 

1292 

1293 break 

1294 else: 

1295 opener_idx = self._prev_delim(opener_idx) 

1296 else: 

1297 # No opener for current. 

1298 openers_bottom_idxs[ 

1299 (current.kind, current.len % 3, current.can_open) 

1300 ] = self._prev_delim(current_idx) 

1301 next_idx = self._next_delim(current_idx) 

1302 if not current.can_open: 

1303 self._remove_delim(current_idx) 

1304 current_idx = next_idx 

1305 

1306 
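A hedged end-to-end illustration of the delimiter pairing above, using the public `parse_paragraph` helper defined earlier in this module (exact region classes come from `yuio.doc`):

    regions = MdParser().parse_paragraph("foo *bar* baz")
    # Expected shape: the plain strings "foo " and " baz" around one
    # yuio.doc.HighlightedRegion("bar", color="em") produced by the matched
    # pair of "*" delimiters; the used-up delimiters themselves emit no text.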

1307def parse(text: str, /, *, dedent: bool = True) -> yuio.doc.Document: 

1308 """ 

1309 Parse a markdown document and return an AST node. 

1310 

1311 :param text: 

1312 text to parse. Common indentation will be removed from this string, 

1313 making it suitable to use with triple quote literals. 

1314 :param dedent: 

1315 remove leading indent from `text`. 

1316 :returns: 

1317 parsed AST node. 

1318 

1319 """ 

1320 

1321 if dedent: 

1322 text = _dedent(text) 

1323 

1324 return MdParser().parse(text)
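A minimal usage sketch (editorial addition, hedged): because of the dedent step, `parse` can be fed indented triple-quoted literals directly.

    doc = parse(
        """
        Commands
        ========

        - `run` starts the app,
        - `stop` stops it.
        """
    )
    # Common leading indentation is stripped before parsing, so the setext
    # heading and the bullet list are recognised as if written flush left.
    # Pass dedent=False to keep the text as-is.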