"""LaTeX to Discord Formatting Converter.
Converts LaTeX/math formatting to Discord-friendly text with Unicode symbols.
Strategy:
- Preserve fenced and inline code as-is.
- Convert display math to plain text (no code fences).
- Convert inline math to inline code with lightweight symbol mapping.
- Replace common LaTeX commands with Unicode equivalents.
- Handle nested ``\\frac{}{}``, ``\\sqrt{}``, superscript/subscript to Unicode.
- Support matrix environments and special math fonts.
"""
import re
import logging
from typing import Tuple
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Unicode mapping tables (module-level for reuse)
# ---------------------------------------------------------------------------
# Maps a LaTeX command name (without the leading backslash) to the text that
# should replace it. Function-like operators (sin, log, lim, ...) map to their
# own name so that only the backslash disappears.
LATEX_TO_UNICODE = {
    # Greek lowercase
    "alpha": "\u03b1",
    "beta": "\u03b2",
    "gamma": "\u03b3",
    "delta": "\u03b4",
    "epsilon": "\u03b5",
    "zeta": "\u03b6",
    "eta": "\u03b7",
    "theta": "\u03b8",
    "iota": "\u03b9",
    "kappa": "\u03ba",
    "lambda": "\u03bb",
    "mu": "\u03bc",
    "nu": "\u03bd",
    "xi": "\u03be",
    "pi": "\u03c0",
    "rho": "\u03c1",
    "sigma": "\u03c3",
    "tau": "\u03c4",
    "upsilon": "\u03c5",
    "phi": "\u03c6",
    "chi": "\u03c7",
    "psi": "\u03c8",
    "omega": "\u03c9",
    # Greek uppercase
    "Gamma": "\u0393",
    "Delta": "\u0394",
    "Theta": "\u0398",
    "Lambda": "\u039b",
    "Xi": "\u039e",
    "Pi": "\u03a0",
    "Sigma": "\u03a3",
    "Upsilon": "\u03a5",
    "Phi": "\u03a6",
    "Psi": "\u03a8",
    "Omega": "\u03a9",
    # Greek variants
    "varepsilon": "\u03b5",
    "vartheta": "\u03d1",
    "varpi": "\u03d6",
    "varrho": "\u03f1",
    "varsigma": "\u03c2",
    "varphi": "\u03c6",
    # Operators and misc
    "times": "\u00d7",
    "cdot": "\u00b7",
    "pm": "\u00b1",
    "mp": "\u2213",
    "leq": "\u2264",
    "geq": "\u2265",
    "neq": "\u2260",
    "approx": "\u2248",
    "sim": "\u223c",
    "infty": "\u221e",
    "infinity": "\u221e",
    "to": "\u2192",
    "rightarrow": "\u2192",
    "leftarrow": "\u2190",
    "Rightarrow": "\u21d2",
    "Leftarrow": "\u21d0",
    "Leftrightarrow": "\u21d4",
    # Arrows (Knuth up-arrow / tetration notation lives here: a \uparrow\uparrow n)
    "leftrightarrow": "\u2194",
    "uparrow": "\u2191",
    "downarrow": "\u2193",
    "updownarrow": "\u2195",
    "Uparrow": "\u21d1",
    "Downarrow": "\u21d3",
    "Updownarrow": "\u21d5",
    "longleftarrow": "\u27f5",
    "longrightarrow": "\u27f6",
    "longleftrightarrow": "\u27f7",
    "Longleftarrow": "\u27f8",
    "Longrightarrow": "\u27f9",
    "Longleftrightarrow": "\u27fa",
    "mapsto": "\u21a6",
    "longmapsto": "\u27fc",
    "hookleftarrow": "\u21a9",
    "hookrightarrow": "\u21aa",
    "nearrow": "\u2197",
    "searrow": "\u2198",
    "swarrow": "\u2199",
    "nwarrow": "\u2196",
    "nleftarrow": "\u219a",
    "nrightarrow": "\u219b",
    "nleftrightarrow": "\u21ae",
    "nLeftarrow": "\u21cd",
    "nRightarrow": "\u21cf",
    "nLeftrightarrow": "\u21ce",
    "leftleftarrows": "\u21c7",
    "rightrightarrows": "\u21c9",
    "leftrightarrows": "\u21c6",
    "rightleftarrows": "\u21c4",
    "leftharpoonup": "\u21bc",
    "leftharpoondown": "\u21bd",
    "rightharpoonup": "\u21c0",
    "rightharpoondown": "\u21c1",
    "upharpoonright": "\u21be",
    "upharpoonleft": "\u21bf",
    "downharpoonright": "\u21c2",
    "downharpoonleft": "\u21c3",
    "leftrightharpoons": "\u21cb",
    "rightleftharpoons": "\u21cc",
    "partial": "\u2202",
    "nabla": "\u2207",
    "sum": "\u2211",
    "prod": "\u220f",
    "int": "\u222b",
    "forall": "\u2200",
    "exists": "\u2203",
    "neg": "\u00ac",
    "lor": "\u2228",
    "land": "\u2227",
    "oplus": "\u2295",
    "otimes": "\u2297",
    # Set theory
    "in": "\u2208",
    "notin": "\u2209",
    "subset": "\u2282",
    "supset": "\u2283",
    "subseteq": "\u2286",
    "supseteq": "\u2287",
    "cup": "\u222a",
    "cap": "\u2229",
    "emptyset": "\u2205",
    "varnothing": "\u2205",
    "setminus": "\u2216",
    "complement": "\u2201",
    # Logic
    "implies": "\u21d2",
    "iff": "\u21d4",
    "equiv": "\u2261",
    "therefore": "\u2234",
    "because": "\u2235",
    # Relations
    "cong": "\u2245",
    "ncong": "\u2247",
    "propto": "\u221d",
    "prec": "\u227a",
    "succ": "\u227b",
    "preceq": "\u2aaf",
    "succeq": "\u2ab0",
    "parallel": "\u2225",
    "perp": "\u22a5",
    "asymp": "\u224d",
    # Dots
    "ldots": "...",
    "cdots": "\u22ef",
    "vdots": "\u22ee",
    "ddots": "\u22f1",
    "dots": "...",
    # Brackets and delimiters
    "langle": "\u27e8",
    "rangle": "\u27e9",
    "lfloor": "\u230a",
    "rfloor": "\u230b",
    "lceil": "\u2308",
    "rceil": "\u2309",
    "lbrace": "{",
    "rbrace": "}",
    # Trigonometric functions (preserve as-is)
    "sin": "sin",
    "cos": "cos",
    "tan": "tan",
    "cot": "cot",
    "sec": "sec",
    "csc": "csc",
    "arcsin": "arcsin",
    "arccos": "arccos",
    "arctan": "arctan",
    "sinh": "sinh",
    "cosh": "cosh",
    "tanh": "tanh",
    "coth": "coth",
    "sech": "sech",
    "csch": "csch",
    # Logarithmic and exponential
    "ln": "ln",
    "log": "log",
    "exp": "exp",
    "lg": "lg",
    # Mathematical functions
    "det": "det",
    "dim": "dim",
    "ker": "ker",
    "deg": "deg",
    "gcd": "gcd",
    "lcm": "lcm",
    "arg": "arg",
    "hom": "hom",
    "Pr": "Pr",
    "mod": "mod",
    # Limits and bounds
    "lim": "lim",
    "sup": "sup",
    "inf": "inf",
    "max": "max",
    "min": "min",
    "limsup": "lim sup",
    "liminf": "lim inf",
    # Other symbols
    "ell": "\u2113",
    "hbar": "\u210f",
    "Re": "\u211c",
    "Im": "\u2111",
    "wp": "\u2118",
    "angle": "\u2220",
    "triangle": "\u25b3",
    "square": "\u25a1",
    "diamond": "\u25c7",
    "star": "\u22c6",
    "dagger": "\u2020",
    "ddagger": "\u2021",
    # Short aliases and other everyday commands. Without these the command
    # name would leak into the output as a plain word (e.g. "x le 5").
    "le": "\u2264",
    "ge": "\u2265",
    "ne": "\u2260",
    "ll": "\u226a",
    "gg": "\u226b",
    "gets": "\u2190",
    "div": "\u00f7",
    "circ": "\u2218",
    "wedge": "\u2227",
    "vee": "\u2228",
    "lnot": "\u00ac",
    "mid": "\u2223",
    "simeq": "\u2243",
    "oint": "\u222e",
    "aleph": "\u2135",
    "top": "\u22a4",
    "bot": "\u22a5",
    "bmod": "mod",
    "prime": "\u2032",
}
# Blackboard-bold (double-struck) capitals for \mathbb{X}. C, H, N, P, Q, R and
# Z live in the Letterlike Symbols block; every other letter comes from the
# Mathematical Alphanumeric Symbols block, so all 26 capitals are covered.
MATHBB_MAP = {
    "A": "\U0001d538",
    "B": "\U0001d539",
    "C": "\u2102",
    "D": "\U0001d53b",
    "E": "\U0001d53c",
    "F": "\U0001d53d",
    "G": "\U0001d53e",
    "H": "\u210d",
    "I": "\U0001d540",
    "J": "\U0001d541",
    "K": "\U0001d542",
    "L": "\U0001d543",
    "M": "\U0001d544",
    "N": "\u2115",
    "O": "\U0001d546",
    "P": "\u2119",
    "Q": "\u211a",
    "R": "\u211d",
    "S": "\U0001d54a",
    "T": "\U0001d54b",
    "U": "\U0001d54c",
    "V": "\U0001d54d",
    "W": "\U0001d54e",
    "X": "\U0001d54f",
    "Y": "\U0001d550",
    "Z": "\u2124",
}
# Script (calligraphic) capitals for \mathcal{X}. B, E, F, H, I, L, M and R
# have no code point in the Mathematical Alphanumeric Symbols block; their
# script forms live in the Letterlike Symbols block instead.
MATHCAL_MAP = {
    "A": "\U0001d49c",
    "B": "\u212c",
    "C": "\U0001d49e",
    "D": "\U0001d49f",
    "E": "\u2130",
    "F": "\u2131",
    "G": "\U0001d4a2",
    "H": "\u210b",
    "I": "\u2110",
    "J": "\U0001d4a5",
    "K": "\U0001d4a6",
    "L": "\u2112",
    "M": "\u2133",
    "N": "\U0001d4a9",
    "O": "\U0001d4aa",
    "P": "\U0001d4ab",
    "Q": "\U0001d4ac",
    "R": "\u211b",
    "S": "\U0001d4ae",
    "T": "\U0001d4af",
    "U": "\U0001d4b0",
    "V": "\U0001d4b1",
    "W": "\U0001d4b2",
    "X": "\U0001d4b3",
    "Y": "\U0001d4b4",
    "Z": "\U0001d4b5",
}
# Characters that have a Unicode superscript form, paired positionally with
# their glyphs: the digits 0-9, then "+", "-", "(", ")", "=", "n" and "i".
SUPERSCRIPT_MAP = dict(
    zip(
        "0123456789+-()=ni",
        "\u2070\u00b9\u00b2\u00b3\u2074\u2075\u2076\u2077\u2078\u2079"
        "\u207a\u207b\u207d\u207e\u207c\u207f\u2071",
    )
)
# Characters that have a Unicode subscript form, paired positionally with
# their glyphs: the digits 0-9, then "+", "-", "(", ")" and "=".
SUBSCRIPT_MAP = dict(
    zip(
        "0123456789+-()=",
        "\u2080\u2081\u2082\u2083\u2084\u2085\u2086\u2087\u2088\u2089"
        "\u208a\u208b\u208d\u208e\u208c",
    )
)
# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------
[docs]
def convert_latex_to_discord(text: str) -> str:
    r"""Rewrite LaTeX math and formatting in *text* as Discord-friendly text.

    Input that is empty or not a string is returned as ``text or ""``. Text
    without any math delimiter (``\[``, ``\(``, ``\begin{``, ``$$`` or ``$``)
    is returned untouched without further work. Otherwise the conversion is
    delegated to :func:`_do_convert`; if that raises, a warning is logged and
    the original text is returned, so a malformed expression never blanks out
    a message.

    Args:
        text: Candidate message text, possibly containing LaTeX.

    Returns:
        str: The converted text, or the input unchanged when it is not a
        non-empty string, has no LaTeX delimiter, or conversion failed.
    """
    if not isinstance(text, str) or not text:
        # Falsy values collapse to ""; truthy non-strings pass straight through.
        return text or ""

    has_delimiter = re.search(r"(\\\[|\\\(|\\begin\{|\$\$|\$)", text) is not None
    if has_delimiter:
        try:
            return _do_convert(text)
        except Exception as exc:
            # Best effort: never let a conversion bug swallow the message.
            logger.warning("LaTeX conversion failed: %s", exc)
    return text
# ---------------------------------------------------------------------------
# Internal implementation
# ---------------------------------------------------------------------------
def _do_convert(text: str) -> str:
    """Convert everything in *text* except triple-backtick fenced blocks.

    The text is cut into alternating prose and fenced-code pieces. Fenced
    blocks are emitted verbatim; each non-empty prose piece is passed through
    :func:`_transform_non_code`.

    Args:
        text: The full message text.

    Returns:
        str: The reassembled text with prose converted and fences preserved.
    """

    def _segments():
        # Yield (is_code, chunk) pairs in document order.
        cursor = 0
        for fence in re.finditer(r"```[\s\S]*?```", text, re.MULTILINE):
            yield False, text[cursor : fence.start()]
            yield True, fence.group(0)
            cursor = fence.end()
        yield False, text[cursor:]

    return "".join(
        chunk if is_code else _transform_non_code(chunk)
        for is_code, chunk in _segments()
        if chunk  # empty prose gaps are skipped entirely
    )
def _transform_non_code(segment: str) -> str:
    """Convert a fence-free segment, leaving single-backtick code spans alone.

    Every stretch of text between inline code spans (including the stretch
    after the last span, even when empty) goes through
    :func:`_transform_text_styles` and then :func:`_transform_math_segments`.
    The code spans themselves are copied through verbatim.

    Args:
        segment: Text that lies outside any fenced code block.

    Returns:
        str: The segment with styles and math converted outside code spans.
    """

    def _convert(prose: str) -> str:
        # Styles first, then math -- the same order for every stretch.
        return _transform_math_segments(_transform_text_styles(prose))

    out: list[str] = []
    cursor = 0
    for span in re.finditer(r"`[^`]*`", segment):
        out += [_convert(segment[cursor : span.start()]), span.group(0)]
        cursor = span.end()
    out.append(_convert(segment[cursor:]))
    return "".join(out)
# ---------------------------------------------------------------------------
# Math segment transforms
# ---------------------------------------------------------------------------
def _transform_math_segments(s: str) -> str:
    r"""Replace every recognised math region in *s* with its converted form.

    The rules run in a fixed order, each over the output of the previous one:

    1. display ``\[...\]``
    2. ``equation`` / ``align`` / ``gather`` / ``multline`` environments
       (starred or not)
    3. display ``$$...$$``
    4. inline ``\(...\)``
    5. inline ``$...$``

    Display math is emitted as plain converted text; inline math is wrapped
    in backticks. The single-dollar case is delegated to
    :func:`_repl_inline_dollar`.

    Args:
        s: Text outside code spans.

    Returns:
        str: The text with recognised math regions converted.
    """

    def _display(group: int):
        # Display math: convert the stripped inner text, no code span.
        return lambda m: _process_math(m.group(group).strip())

    def _inline(m: re.Match) -> str:
        # Inline math: convert the inner text and wrap it in a code span.
        return f"`{_process_math(m.group(1))}`"

    rules = (
        (r"\\\[\s*([\s\S]*?)\s*\\\]", _display(1)),
        (
            r"\\begin\{(equation\*?|align\*?|gather\*?|multline\*?)\}"
            r"\s*([\s\S]*?)\s*\\end\{\1\}",
            _display(2),
        ),
        (r"\$\$\s*([\s\S]*?)\s*\$\$", _display(1)),
        (r"\\\(\s*([\s\S]*?)\s*\\\)", _inline),
        # Inline $...$ follows positional delimiter rules so that currency
        # ("$5", "$5 and $10", "$5-$10") is not taken for math:
        #   * the opening "$" is not preceded by a backslash or another "$"
        #     and is followed by a non-space, non-"$" character;
        #   * the closing "$" is preceded by a non-space, non-"$" character
        #     and is not followed by a digit;
        #   * the span contains neither "$" nor a newline.
        (
            r"(?<![\\$])\$(?=[^\s$])([^$\n]*?)(?<=[^\s$])\$(?!\d)",
            _repl_inline_dollar,
        ),
    )
    for pattern, replacer in rules:
        s = re.sub(pattern, replacer, s)
    return s
def _repl_inline_dollar(m: re.Match) -> str:
    """Render one ``$...$`` match as a backtick-wrapped converted expression.

    The match is left exactly as written when its inner text, once stripped,
    still contains a space (multi-word) *and* shows no sign of math: no
    backslash, no digit, and none of the characters ``^_=+*/<>!|(){}[]``.
    Everything else is converted with :func:`_process_math` and wrapped in
    backticks.

    Args:
        m: A match whose group 1 is the text between the dollar signs.

    Returns:
        str: The converted inline-code span, or the original matched text
        for math-free multi-word prose.
    """
    body = m.group(1)
    if " " in body.strip():
        looks_like_math = "\\" in body or any(
            ch.isdigit() or ch in "^_=+*/<>!|(){}[]" for ch in body
        )
        if not looks_like_math:
            return m.group(0)
    return f"`{_process_math(body)}`"
def _transform_text_styles(s: str) -> str:
    r"""Rewrite LaTeX text-style commands as Discord markdown.

    Handled commands, applied in this order:

    * ``\textbf`` / ``\mathbf`` -> ``**bold**``
    * ``\textit`` / ``\mathit`` / ``\emph`` -> ``*italic*``
    * ``\underline`` -> ``__underline__``
    * ``\texttt`` / ``\mathtt`` -> inline code

    Only arguments that are non-empty and contain no braces are matched, and
    each rule runs once, so an outer command wrapping another braced command
    is left in place.

    Args:
        s: Text outside code spans.

    Returns:
        str: The text with the recognised style commands rewritten.
    """
    style_rules = (
        (r"\\(?:textbf|mathbf)\{([^{}]+)\}", r"**\1**"),
        (r"\\(?:textit|mathit|emph)\{([^{}]+)\}", r"*\1*"),
        (r"\\underline\{([^{}]+)\}", r"__\1__"),
        (r"\\(?:texttt|mathtt)\{([^{}]+)\}", r"`\1`"),
    )
    for pattern, markdown in style_rules:
        s = re.sub(pattern, markdown, s)
    return s
# ---------------------------------------------------------------------------
# Core math processing pipeline
# ---------------------------------------------------------------------------
def _process_math(content: str) -> str:
    r"""Run the full conversion pipeline on one math expression.

    Stages, in order: :func:`_convert_matrix_environments`,
    :func:`_convert_limits_and_bounds`, :func:`_convert_nested_structures`,
    :func:`_apply_symbol_map`, :func:`_convert_sup_sub` and
    :func:`_final_cleanup`.

    Literal braces (``\{``, ``\}``, ``\lbrace``, ``\rbrace``) are shielded
    before the pipeline runs and restored afterwards. Without this,
    :func:`_final_cleanup` -- which deletes every ``{`` and ``}`` it finds --
    would also delete the braces the author asked for, turning
    ``\{1, 2\}`` into ``1, 2``.

    Args:
        content: Raw LaTeX expression with its delimiters already removed.

    Returns:
        str: The expression rendered with Unicode symbols and plain text.
    """
    # Private-use code points stand in for literal braces while the pipeline
    # runs. NOTE(review): assumes the message itself never contains
    # U+E000/U+E001.
    lbrace_mark, rbrace_mark = "\ue000", "\ue001"

    def _shield(m: re.Match) -> str:
        token = m.group(0)
        if token == "\\\\":
            # A row break; matched only so that the "\{" inside "\\{" is not
            # mistaken for an escaped brace.
            return token
        return lbrace_mark if token in ("\\{", "\\lbrace") else rbrace_mark

    content = re.sub(
        r"\\\\|\\[{}]|\\[lr]brace(?![A-Za-z])", _shield, content
    )

    content = _convert_matrix_environments(content)
    content = _convert_limits_and_bounds(content)
    content = _convert_nested_structures(content)
    content = _apply_symbol_map(content)
    content = _convert_sup_sub(content)
    content = _final_cleanup(content)

    return content.replace(lbrace_mark, "{").replace(rbrace_mark, "}")
# ---------------------------------------------------------------------------
# Brace extraction
# ---------------------------------------------------------------------------
def _extract_braced(s: str, start: int) -> Tuple[str, int]:
    """Return the contents of the balanced ``{...}`` group opening at *start*.

    Nested braces are tracked, and the character following a backslash is
    skipped so that escaped braces do not affect the nesting count.

    Args:
        s: The string being scanned.
        start: Index expected to hold the opening ``{``.

    Returns:
        Tuple[str, int]: ``(inner_text, index_after_closing_brace)`` on
        success. ``("", start)`` when ``s[start]`` is not ``{`` (or *start*
        is past the end) or when the group never closes.
    """
    if start >= len(s) or s[start] != "{":
        return "", start

    nesting = 0
    skip_next = False
    for idx in range(start, len(s)):
        if skip_next:
            # This character was escaped by the preceding backslash.
            skip_next = False
            continue
        ch = s[idx]
        if ch == "\\":
            skip_next = True
        elif ch == "{":
            nesting += 1
        elif ch == "}":
            nesting -= 1
            if nesting == 0:
                return s[start + 1 : idx], idx + 1
    return "", start
# ---------------------------------------------------------------------------
# Symbol map + accents
# ---------------------------------------------------------------------------
def _apply_symbol_map(s: str) -> str:
    r"""Replace LaTeX fonts, accents, wrappers and commands with plain text.

    Steps, in order:

    1. ``\mathbb`` / ``\mathcal`` single capitals via ``MATHBB_MAP`` /
       ``MATHCAL_MAP`` (bracketed placeholder when unmapped); ``\mathfrak``
       always uses a bracketed placeholder.
    2. Accents ``\hat`` ``\bar`` ``\tilde`` ``\vec`` ``\dot`` ``\ddot`` as
       combining marks.
    3. ``\binom{n}{k}`` -> ``C(n,k)`` and ``\pmod{m}`` -> ``(mod m)``.
    4. ``\{`` / ``\}`` -> literal braces.
    5. ``\mathbf`` -> ``**bold**``; ``\text``-family wrappers are unwrapped.
    6. ``\\`` -> newline.
    7. Spacing commands (``\,`` ``\:`` ``\;`` ``\ `` ``\quad`` ``\qquad``)
       become a space and ``\!`` is dropped; escaped ``\%`` ``\&`` ``\#``
       lose their backslash. Without this step they would leak into the
       output as raw LaTeX.
    8. ``\left`` / ``\right`` sizing prefixes are removed.
    9. Every remaining ``\command`` is looked up in ``LATEX_TO_UNICODE``;
       unknown commands are left intact.

    Args:
        s: The partially converted expression.

    Returns:
        str: The expression with the constructs above converted.
    """

    def _repl_mathbb(m: re.Match) -> str:
        """Blackboard-bold glyph for the captured capital, else a placeholder."""
        return MATHBB_MAP.get(m.group(1), f"\U0001d539\U0001d539[{m.group(1)}]")

    def _repl_mathcal(m: re.Match) -> str:
        """Script glyph for the captured capital, else a placeholder."""
        return MATHCAL_MAP.get(
            m.group(1), f"\U0001d4d2\U0001d4d0\U0001d4db[{m.group(1)}]"
        )

    def _repl_dot(m: re.Match) -> str:
        """Combining dot above for one letter, ``<content>-dot`` otherwise."""
        c = m.group(1)
        return (c + "\u0307") if len(c) == 1 and c.isalpha() else f"{c}-dot"

    def _repl_ddot(m: re.Match) -> str:
        """Combining diaeresis for one letter, ``<content>-ddot`` otherwise."""
        c = m.group(1)
        return (c + "\u0308") if len(c) == 1 and c.isalpha() else f"{c}-ddot"

    def _repl_cmd(m: re.Match) -> str:
        """Mapped text for a known command; the original text otherwise."""
        return LATEX_TO_UNICODE.get(m.group(1), m.group(0))

    # Math fonts
    s = re.sub(r"\\mathbb\{([A-Z])\}", _repl_mathbb, s)
    s = re.sub(r"\\mathcal\{([A-Z])\}", _repl_mathcal, s)
    s = re.sub(
        r"\\mathfrak\{([^{}]+)\}",
        lambda m: f"\U0001d509\U0001d52f\U0001d51e\U0001d528[{m.group(1)}]",
        s,
    )

    # Accents: the combining mark is appended after the argument.
    s = re.sub(r"\\hat\{([^{}]+)\}", "\\1\u0302", s)
    s = re.sub(r"\\bar\{([^{}]+)\}", "\\1\u0304", s)
    s = re.sub(r"\\tilde\{([^{}]+)\}", "\\1\u0303", s)
    s = re.sub(r"\\vec\{([^{}]+)\}", "\\1\u20d7", s)
    s = re.sub(r"\\dot\{([^{}]+)\}", _repl_dot, s)
    s = re.sub(r"\\ddot\{([^{}]+)\}", _repl_ddot, s)

    # Binomial / modulo
    s = re.sub(
        r"\\binom\{([^{}]+)\}\{([^{}]+)\}",
        lambda m: f"C({m.group(1)},{m.group(2)})",
        s,
    )
    s = re.sub(r"\\pmod\{([^{}]+)\}", r"(mod \1)", s)

    # Literal braces
    s = s.replace(r"\{", "{").replace(r"\}", "}")

    # Text wrappers
    s = re.sub(r"\\text\(([^()]+)\)", r"\1", s)
    s = re.sub(r"\\mathbf\{([^{}]+)\}", r"**\1**", s)
    s = re.sub(
        r"\\(?:text|mathrm|textrm|mathit|mathsf|mathtt)\{([^{}]+)\}",
        r"\1",
        s,
    )

    # Row / line breaks
    s = s.replace("\\\\", "\n")

    # Spacing commands and escaped specials. This runs after the row-break
    # replacement so that "\\," is read as a break followed by a comma rather
    # than a backslash followed by "\,".
    s = re.sub(r"\\q?quad(?![A-Za-z])", " ", s)
    s = re.sub(r"\\[,:; ]", " ", s)
    s = s.replace("\\!", "")
    s = re.sub(r"\\([%&#])", r"\1", s)

    # Strip the \left / \right sizing commands only. The negative lookahead
    # keeps them from eating the prefix of longer commands such as
    # \leftarrow or \rightarrow, which must reach the lookup below.
    s = re.sub(r"\\left(?![A-Za-z])\s*", "", s)
    s = re.sub(r"\\right(?![A-Za-z])\s*", "", s)

    # Everything else: plain command lookup.
    s = re.sub(r"\\([A-Za-z]+)", _repl_cmd, s)
    return s
# ---------------------------------------------------------------------------
# Superscript / subscript
# ---------------------------------------------------------------------------
def _convert_sup_sub(s: str) -> str:
    """Render ``^`` superscripts and ``_`` subscripts with Unicode glyphs.

    Four passes run in order: braced superscripts, single-character
    superscripts (a digit or lowercase letter), braced subscripts and
    single-digit subscripts. A braced span is converted only when every
    character has a glyph in ``SUPERSCRIPT_MAP`` / ``SUBSCRIPT_MAP``;
    otherwise it is kept as a ``^{...}`` / ``_{...}`` literal. A single
    character without a glyph keeps its ``^x`` / ``_x`` form.

    Args:
        s: The partially converted expression.

    Returns:
        str: The expression with super/subscripts converted where possible.
    """

    def _table_for(marker: str) -> dict:
        # Looked up lazily, at substitution time.
        return SUPERSCRIPT_MAP if marker == "^" else SUBSCRIPT_MAP

    def _braced(marker: str):
        def _render(m: re.Match) -> str:
            body = m.group(1)
            glyphs = _table_for(marker)
            if all(ch in glyphs for ch in body):
                return "".join(glyphs[ch] for ch in body)
            # At least one character has no glyph: keep the whole span.
            return f"{marker}{{{body}}}"

        return _render

    def _single(marker: str):
        def _render(m: re.Match) -> str:
            ch = m.group(1)
            return _table_for(marker).get(ch, f"{marker}{ch}")

        return _render

    s = re.sub(r"\^\{([^{}]+)\}", _braced("^"), s)
    s = re.sub(r"\^([0-9a-z])", _single("^"), s)
    s = re.sub(r"_\{([^{}]+)\}", _braced("_"), s)
    s = re.sub(r"_([0-9])", _single("_"), s)
    return s
# ---------------------------------------------------------------------------
# Nested structures (\frac, \sqrt)
# ---------------------------------------------------------------------------
def _is_fully_parenthesized(expr: str) -> bool:
    """Tell whether the leading ``(`` of *expr* closes at its last character."""
    if not (expr.startswith("(") and expr.endswith(")")):
        return False
    depth = 0
    for idx, ch in enumerate(expr):
        if ch == "(":
            depth += 1
        elif ch == ")":
            depth -= 1
            if depth == 0:
                # The first group ends here; it wraps the whole expression
                # only if this is the final character.
                return idx == len(expr) - 1
    return False


def _is_simple_expression(expr: str) -> bool:
    """Decide whether *expr* can be inlined without surrounding parentheses.

    Used when laying out fractions and roots in linear form. Rules, in order:

    * an empty expression is simple;
    * an expression wrapped by one matching pair of parentheses is simple
      (``(a+b)`` qualifies; ``(a)+(b)`` does not, because the opening
      parenthesis closes before the end);
    * anything containing ``/``, ``×``, ``·``, ``∑``, ``∫`` or ``∏`` is not
      simple;
    * anything mixing ``+``/``-`` with ``^``, ``_`` or ``√`` is not simple;
    * otherwise it is simple only if it has no whitespace and none of
      ``+ - * /``.

    Args:
        expr: The sub-expression to classify.

    Returns:
        bool: ``True`` if the expression may be inlined as-is, ``False`` if
        it should be parenthesised.
    """
    if not expr:
        return True
    expr = expr.strip()
    if _is_fully_parenthesized(expr):
        return True
    if "/" in expr or "\u00d7" in expr or "\u00b7" in expr:
        return False
    if "\u2211" in expr or "\u222b" in expr or "\u220f" in expr:
        return False
    has_additive = "+" in expr or "-" in expr
    has_script = "^" in expr or "_" in expr or "\u221a" in expr
    if has_additive and has_script:
        return False
    return re.match(r"^[^\s+\-*/]+$", expr) is not None
def _convert_nested_structures(s: str) -> str:
    r"""Render ``\sqrt{...}`` as ``√...`` and ``\frac{a}{b}`` as ``a/b``.

    The string is scanned left to right. For each ``\sqrt{`` or ``\frac{``
    the balanced arguments are taken with :func:`_extract_braced`, converted
    recursively, and only then checked with :func:`_is_simple_expression` to
    decide whether they need parentheses. Converting the operands first is
    what keeps nested constructs unambiguous: ``\frac{a}{\frac{b}{c}}``
    becomes ``a/(b/c)`` and ``\sqrt{\frac{a}{b}}`` becomes ``√(a/b)``.

    A command whose argument is empty or unbalanced is left in place as
    literal text (exactly once) and scanning resumes right after it.

    Args:
        s: The expression possibly containing roots and fractions.

    Returns:
        str: The expression with roots and fractions in linear notation.
    """
    opener = re.compile(r"\\(sqrt|frac)\{")

    def _wrap(expr: str) -> str:
        # Parenthesise anything that would be ambiguous when inlined.
        return expr if _is_simple_expression(expr) else f"({expr})"

    def _convert(text: str) -> str:
        out: list[str] = []
        pos = 0
        while True:
            m = opener.search(text, pos)
            if m is None:
                out.append(text[pos:])
                break
            out.append(text[pos : m.start()])

            rendered = None
            end = m.end()
            # m.end() - 1 is the index of the opening brace.
            first, after_first = _extract_braced(text, m.end() - 1)
            if first:
                if m.group(1) == "sqrt":
                    rendered = "\u221a" + _wrap(_convert(first))
                    end = after_first
                else:
                    second, after_second = _extract_braced(text, after_first)
                    if second:
                        numerator = _wrap(_convert(first))
                        denominator = _wrap(_convert(second))
                        rendered = f"{numerator}/{denominator}"
                        end = after_second

            if rendered is None:
                # Empty or unbalanced argument: keep just the command token
                # and continue after it. The text before it was already
                # emitted above, so it must not be appended again.
                out.append(m.group(0))
                pos = m.end()
            else:
                out.append(rendered)
                pos = end
        return "".join(out)

    return _convert(s)
# ---------------------------------------------------------------------------
# Limits, sums, integrals, products
# ---------------------------------------------------------------------------
def _convert_limits_and_bounds(s: str) -> str:
    r"""Convert ``\lim``, ``\sum``, ``\int`` and ``\prod`` with their bounds.

    * ``\limits`` / ``\nolimits`` are removed first.
    * ``\lim_{...}`` becomes ``lim[...]``; a bare ``\lim`` becomes ``lim``.
    * ``\sum``, ``\int`` and ``\prod`` become ∑, ∫ and ∏. Bounds read by
      :func:`_extract_bounds` are rendered as ``[lower to upper]`` when both
      exist and ``[lower]`` when only the lower one does. An upper bound on
      its own is kept as ``^{upper}`` so that it is not lost.

    Each command is matched only as a whole word, so longer commands that
    merely start with the same letters (``\limsup``, ``\intercal``) are left
    untouched.

    Args:
        s: The expression possibly containing bounded operators.

    Returns:
        str: The expression with limits and big operators converted.
    """
    # Placement hints carry no meaning in linear text.
    s = re.sub(r"\\(?:no)?limits(?![A-Za-z])", "", s)

    s = re.sub(r"\\lim_\{([^{}]+)\}", lambda m: f"lim[{m.group(1)}]", s)
    s = re.sub(r"\\lim(?![A-Za-z])", "lim", s)

    operators = (("sum", "\u2211"), ("int", "\u222b"), ("prod", "\u220f"))
    for name, symbol in operators:
        opener = re.compile(r"\\" + name + r"(?![A-Za-z])")
        out: list[str] = []
        pos = 0
        while True:
            m = opener.search(s, pos)
            if m is None:
                out.append(s[pos:])
                break
            out.append(s[pos : m.start()])
            lower, upper, pos = _extract_bounds(s, m.end())
            if lower and upper:
                out.append(f"{symbol}[{lower} to {upper}]")
            elif lower:
                out.append(f"{symbol}[{lower}]")
            elif upper:
                # No lower bound: keep the upper one as a superscript.
                out.append(f"{symbol}^{{{upper}}}")
            else:
                out.append(symbol)
        s = "".join(out)
    return s
def _extract_bounds(s: str, pos: int) -> Tuple[str, str, int]:
    r"""Read the ``_lower`` and ``^upper`` bounds that start at *pos*.

    The two bounds may appear in either order (``_{i=1}^{n}`` or
    ``^{n}_{i=1}``); each is read at most once. A bound's argument may be a
    braced group (via :func:`_extract_braced`), a backslash command such as
    ``\infty`` (returned with its backslash), or a single alphanumeric
    character. A ``_`` or ``^`` with no readable argument is still consumed
    and yields an empty bound.

    Args:
        s: The string being scanned.
        pos: Index immediately after the operator token.

    Returns:
        Tuple[str, str, int]: ``(lower, upper, index_after_bounds)``; either
        bound is ``""`` when absent.
    """
    command = re.compile(r"\\([A-Za-z]+)")

    def _read_argument(idx: int) -> Tuple[str, int]:
        # Returns (argument, index_after); ("", idx) when nothing readable.
        if idx >= len(s):
            return "", idx
        ch = s[idx]
        if ch == "{":
            return _extract_braced(s, idx)
        if ch == "\\":
            m = command.match(s, idx)
            return (m.group(0), m.end()) if m else ("", idx)
        if ch.isalnum():
            return ch, idx + 1
        return "", idx

    bounds = {"_": "", "^": ""}
    seen = set()
    while pos < len(s) and s[pos] in bounds and s[pos] not in seen:
        marker = s[pos]
        seen.add(marker)
        bounds[marker], pos = _read_argument(pos + 1)
    return bounds["_"], bounds["^"], pos
# ---------------------------------------------------------------------------
# Matrix environments
# ---------------------------------------------------------------------------
def _convert_matrix_environments(s: str) -> str:
    r"""Render LaTeX matrix environments as multi-line text grids.

    Recognises ``pmatrix``, ``bmatrix``, ``matrix``, ``vmatrix`` and
    ``Vmatrix``. The body is split into rows on ``\\`` and into cells on
    ``&``; cells are trimmed and joined with a single space, rows with a
    newline. The grid is then wrapped, on separate lines, in ``( )``,
    ``[ ]``, ``| |`` or ``|| ||`` according to the opening environment;
    plain ``matrix`` gets no wrapper.

    Args:
        s: The expression possibly containing matrix environments.

    Returns:
        str: The expression with matrix environments replaced by text grids.
    """
    delimiters = {
        "pmatrix": ("(", ")"),
        "bmatrix": ("[", "]"),
        "vmatrix": ("|", "|"),
        "Vmatrix": ("||", "||"),
    }

    def _render(m: re.Match) -> str:
        grid = "\n".join(
            " ".join(cell.strip() for cell in row.strip().split("&"))
            for row in m.group(2).split("\\\\")
        )
        pair = delimiters.get(m.group(1))
        if pair is None:
            return grid
        opening, closing = pair
        return f"{opening}\n{grid}\n{closing}"

    return re.sub(
        r"\\begin\{(pmatrix|bmatrix|matrix|vmatrix|Vmatrix)\}\s*([\s\S]*?)\s*"
        r"\\end\{(pmatrix|bmatrix|matrix|vmatrix|Vmatrix)\}",
        _render,
        s,
    )
# ---------------------------------------------------------------------------
# Final cleanup
# ---------------------------------------------------------------------------
def _final_cleanup(s: str) -> str:
    r"""Remove leftover LaTeX scaffolding and tidy whitespace.

    Passes, in order:

    1. delete ``\begin{...}`` / ``\end{...}`` markers;
    2. drop the backslash of any remaining ``\command``;
    3. delete every ``{`` and ``}``;
    4. collapse whitespace that precedes a newline into that newline;
    5. limit runs of newlines to two;
    6. collapse runs of spaces to one.

    The result is stripped of leading and trailing whitespace.

    Args:
        s: The mostly converted expression.

    Returns:
        str: The cleaned expression.
    """
    passes = (
        (r"\\(begin|end)\{[^}]*\}", ""),
        (r"\\([A-Za-z]+)", r"\1"),
        (r"[{}]", ""),
        (r"\s+\n", "\n"),
        (r"\n{3,}", "\n\n"),
        (r" {2,}", " "),
    )
    for pattern, replacement in passes:
        s = re.sub(pattern, replacement, s)
    return s.strip()