# Source code for latex_converter

"""LaTeX to Discord Formatting Converter.

Converts LaTeX/math formatting to Discord-friendly text with Unicode symbols.

Strategy:
- Preserve fenced and inline code as-is.
- Convert display math to plain text (no code fences).
- Convert inline math to inline code with lightweight symbol mapping.
- Replace common LaTeX commands with Unicode equivalents.
- Handle nested ``\\frac{}{}``, ``\\sqrt{}``, superscript/subscript to Unicode.
- Support matrix environments and special math fonts.
"""

import re
import logging
from typing import Tuple

logger = logging.getLogger(__name__)

# ---------------------------------------------------------------------------
# Unicode mapping tables (module-level for reuse)
# ---------------------------------------------------------------------------

# Maps a LaTeX command name (without the leading backslash) to the text that
# replaces it.  Keys are looked up by whole command name, so short aliases such
# as "le" never shadow longer commands such as "leq" or "left".
LATEX_TO_UNICODE = {
    # Greek lowercase
    "alpha": "\u03b1",
    "beta": "\u03b2",
    "gamma": "\u03b3",
    "delta": "\u03b4",
    "epsilon": "\u03b5",
    "zeta": "\u03b6",
    "eta": "\u03b7",
    "theta": "\u03b8",
    "iota": "\u03b9",
    "kappa": "\u03ba",
    "lambda": "\u03bb",
    "mu": "\u03bc",
    "nu": "\u03bd",
    "xi": "\u03be",
    "pi": "\u03c0",
    "rho": "\u03c1",
    "sigma": "\u03c3",
    "tau": "\u03c4",
    "upsilon": "\u03c5",
    "phi": "\u03c6",
    "chi": "\u03c7",
    "psi": "\u03c8",
    "omega": "\u03c9",
    # Greek uppercase
    "Gamma": "\u0393",
    "Delta": "\u0394",
    "Theta": "\u0398",
    "Lambda": "\u039b",
    "Xi": "\u039e",
    "Pi": "\u03a0",
    "Sigma": "\u03a3",
    "Upsilon": "\u03a5",
    "Phi": "\u03a6",
    "Psi": "\u03a8",
    "Omega": "\u03a9",
    # Greek variants
    "varepsilon": "\u03b5",
    "vartheta": "\u03d1",
    "varpi": "\u03d6",
    "varrho": "\u03f1",
    "varsigma": "\u03c2",
    "varphi": "\u03c6",
    # Operators and misc
    "times": "\u00d7",
    "div": "\u00f7",
    "cdot": "\u00b7",
    "circ": "\u2218",
    "pm": "\u00b1",
    "mp": "\u2213",
    "leq": "\u2264",
    "le": "\u2264",  # standard short alias of \leq
    "geq": "\u2265",
    "ge": "\u2265",  # standard short alias of \geq
    "neq": "\u2260",
    "ne": "\u2260",  # standard short alias of \neq
    "approx": "\u2248",
    "sim": "\u223c",
    "infty": "\u221e",
    "infinity": "\u221e",
    # Arrows (Knuth up-arrow / tetration notation lives here: a \uparrow\uparrow n)
    "to": "\u2192",
    "rightarrow": "\u2192",
    "leftarrow": "\u2190",
    "gets": "\u2190",  # standard alias of \leftarrow
    "Rightarrow": "\u21d2",
    "Leftarrow": "\u21d0",
    "Leftrightarrow": "\u21d4",
    "leftrightarrow": "\u2194",
    "uparrow": "\u2191",
    "downarrow": "\u2193",
    "updownarrow": "\u2195",
    "Uparrow": "\u21d1",
    "Downarrow": "\u21d3",
    "Updownarrow": "\u21d5",
    "longleftarrow": "\u27f5",
    "longrightarrow": "\u27f6",
    "longleftrightarrow": "\u27f7",
    "Longleftarrow": "\u27f8",
    "Longrightarrow": "\u27f9",
    "Longleftrightarrow": "\u27fa",
    "mapsto": "\u21a6",
    "longmapsto": "\u27fc",
    "hookleftarrow": "\u21a9",
    "hookrightarrow": "\u21aa",
    "nearrow": "\u2197",
    "searrow": "\u2198",
    "swarrow": "\u2199",
    "nwarrow": "\u2196",
    "nleftarrow": "\u219a",
    "nrightarrow": "\u219b",
    "nleftrightarrow": "\u21ae",
    "nLeftarrow": "\u21cd",
    "nRightarrow": "\u21cf",
    "nLeftrightarrow": "\u21ce",
    "leftleftarrows": "\u21c7",
    "rightrightarrows": "\u21c9",
    "leftrightarrows": "\u21c6",
    "rightleftarrows": "\u21c4",
    "leftharpoonup": "\u21bc",
    "leftharpoondown": "\u21bd",
    "rightharpoonup": "\u21c0",
    "rightharpoondown": "\u21c1",
    "upharpoonright": "\u21be",
    "upharpoonleft": "\u21bf",
    "downharpoonright": "\u21c2",
    "downharpoonleft": "\u21c3",
    "leftrightharpoons": "\u21cb",
    "rightleftharpoons": "\u21cc",
    # Calculus, big operators and logic connectives
    "partial": "\u2202",
    "nabla": "\u2207",
    "sum": "\u2211",
    "prod": "\u220f",
    "int": "\u222b",
    "forall": "\u2200",
    "exists": "\u2203",
    "neg": "\u00ac",
    "lor": "\u2228",
    "vee": "\u2228",  # same glyph as \lor
    "land": "\u2227",
    "wedge": "\u2227",  # same glyph as \land
    "oplus": "\u2295",
    "otimes": "\u2297",
    # Set theory
    "in": "\u2208",
    "notin": "\u2209",
    "subset": "\u2282",
    "supset": "\u2283",
    "subseteq": "\u2286",
    "supseteq": "\u2287",
    "cup": "\u222a",
    "cap": "\u2229",
    "emptyset": "\u2205",
    "varnothing": "\u2205",
    "setminus": "\u2216",
    "complement": "\u2201",
    # Logic
    "implies": "\u21d2",
    "iff": "\u21d4",
    "equiv": "\u2261",
    "therefore": "\u2234",
    "because": "\u2235",
    # Relations
    "cong": "\u2245",
    "ncong": "\u2247",
    "propto": "\u221d",
    "prec": "\u227a",
    "succ": "\u227b",
    "preceq": "\u2aaf",
    "succeq": "\u2ab0",
    "parallel": "\u2225",
    "perp": "\u22a5",
    "asymp": "\u224d",
    # Dots
    "ldots": "...",
    "cdots": "\u22ef",
    "vdots": "\u22ee",
    "ddots": "\u22f1",
    "dots": "...",
    # Brackets and delimiters
    "langle": "\u27e8",
    "rangle": "\u27e9",
    "lfloor": "\u230a",
    "rfloor": "\u230b",
    "lceil": "\u2308",
    "rceil": "\u2309",
    "lbrace": "{",
    "rbrace": "}",
    # Trigonometric and hyperbolic functions (name kept, backslash dropped)
    "sin": "sin",
    "cos": "cos",
    "tan": "tan",
    "cot": "cot",
    "sec": "sec",
    "csc": "csc",
    "arcsin": "arcsin",
    "arccos": "arccos",
    "arctan": "arctan",
    "sinh": "sinh",
    "cosh": "cosh",
    "tanh": "tanh",
    "coth": "coth",
    "sech": "sech",
    "csch": "csch",
    # Logarithmic and exponential
    "ln": "ln",
    "log": "log",
    "exp": "exp",
    "lg": "lg",
    # Mathematical functions
    "det": "det",
    "dim": "dim",
    "ker": "ker",
    "deg": "deg",
    "gcd": "gcd",
    "lcm": "lcm",
    "arg": "arg",
    "hom": "hom",
    "Pr": "Pr",
    "mod": "mod",
    # Limits and bounds
    "lim": "lim",
    "sup": "sup",
    "inf": "inf",
    "max": "max",
    "min": "min",
    "limsup": "lim sup",
    "liminf": "lim inf",
    # Other symbols
    "ell": "\u2113",
    "hbar": "\u210f",
    "Re": "\u211c",
    "Im": "\u2111",
    "wp": "\u2118",
    "angle": "\u2220",
    "triangle": "\u25b3",
    "square": "\u25a1",
    "diamond": "\u25c7",
    "star": "\u22c6",
    "dagger": "\u2020",
    "ddagger": "\u2021",
}

# Blackboard-bold (double-struck) capitals, keyed by the plain ASCII letter.
# C, H, N, P, Q, R and Z are encoded in the Letterlike Symbols block; every
# other letter comes from Mathematical Alphanumeric Symbols (U+1D538 onward,
# which has reserved holes exactly where the Letterlike code points exist).
MATHBB_MAP = {
    "A": "\U0001d538",
    "B": "\U0001d539",
    "C": "\u2102",
    "D": "\U0001d53b",
    "E": "\U0001d53c",
    "F": "\U0001d53d",
    "G": "\U0001d53e",
    "H": "\u210d",
    "I": "\U0001d540",
    "J": "\U0001d541",
    "K": "\U0001d542",
    "L": "\U0001d543",
    "M": "\U0001d544",
    "N": "\u2115",
    "O": "\U0001d546",
    "P": "\u2119",
    "Q": "\u211a",
    "R": "\u211d",
    "S": "\U0001d54a",
    "T": "\U0001d54b",
    "U": "\U0001d54c",
    "V": "\U0001d54d",
    "W": "\U0001d54e",
    "X": "\U0001d54f",
    "Y": "\U0001d550",
    "Z": "\u2124",
}

# Calligraphic (script) capitals, keyed by the plain ASCII letter.
# B, E, F, H, I, L, M and R have no code point in the Mathematical Script run
# (those slots are reserved); their script forms live in the Letterlike Symbols
# block instead.  Using the neighbouring "Mathematical Italic" code points for
# them would render a plain italic letter, not a calligraphic one.
MATHCAL_MAP = {
    "A": "\U0001d49c",
    "B": "\u212c",
    "C": "\U0001d49e",
    "D": "\U0001d49f",
    "E": "\u2130",
    "F": "\u2131",
    "G": "\U0001d4a2",
    "H": "\u210b",
    "I": "\u2110",
    "J": "\U0001d4a5",
    "K": "\U0001d4a6",
    "L": "\u2112",
    "M": "\u2133",
    "N": "\U0001d4a9",
    "O": "\U0001d4aa",
    "P": "\U0001d4ab",
    "Q": "\U0001d4ac",
    "R": "\u211b",
    "S": "\U0001d4ae",
    "T": "\U0001d4af",
    "U": "\U0001d4b0",
    "V": "\U0001d4b1",
    "W": "\U0001d4b2",
    "X": "\U0001d4b3",
    "Y": "\U0001d4b4",
    "Z": "\U0001d4b5",
}

# Characters that have a Unicode superscript form, paired positionally with
# that form: digits 0-9, then "+", "-", "(", ")", "=", "n", "i".
SUPERSCRIPT_MAP = dict(
    zip(
        "0123456789+-()=ni",
        "\u2070\u00b9\u00b2\u00b3\u2074\u2075\u2076\u2077\u2078\u2079"
        "\u207a\u207b\u207d\u207e\u207c\u207f\u2071",
    )
)

# Characters that have a Unicode subscript form, paired positionally with that
# form: digits 0-9, then "+", "-", "(", ")", "=".
SUBSCRIPT_MAP = dict(
    zip(
        "0123456789+-()=",
        "\u2080\u2081\u2082\u2083\u2084\u2085\u2086\u2087\u2088\u2089"
        "\u208a\u208b\u208d\u208e\u208c",
    )
)


# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------



# [docs]  (Sphinx viewcode link marker left over from the rendered HTML page)
def convert_latex_to_discord(text: str) -> str:
    r"""Rewrite LaTeX math in *text* as Discord-friendly Unicode text.

    Three guards run before any real work is done:

    1. A value that is not a ``str``, or is empty, is handed straight back
       (falsy values such as ``None`` come back as ``""``).
    2. Text containing none of the delimiters ``\[``, ``\(``, ``\begin{``,
       ``$$`` or ``$`` is returned unchanged.
    3. Any exception raised by :func:`_do_convert` is logged as a warning and
       the original text is returned, so a failed conversion never loses the
       message.

    Args:
        text: Message text that may contain LaTeX math or formatting commands.

    Returns:
        str: The converted text, or the input as described by the guards above.
    """
    if not isinstance(text, str) or not text:
        return text or ""

    # Cheap gate: skip the whole pipeline when no math delimiter is present.
    has_delimiter = re.search(r"(\\\[|\\\(|\\begin\{|\$\$|\$)", text) is not None
    if not has_delimiter:
        return text

    try:
        converted = _do_convert(text)
    except Exception as e:
        # Best effort by design: fall back to the untouched input.
        logger.warning("LaTeX conversion failed: %s", e)
        return text
    return converted



# ---------------------------------------------------------------------------
# Internal implementation
# ---------------------------------------------------------------------------


def _do_convert(text: str) -> str:
    """Convert everything outside triple-backtick fences, keeping fences intact.

    The text is scanned for fenced code blocks. Each fence is copied to the
    output verbatim; each non-empty stretch before, between, or after fences is
    passed through :func:`_transform_non_code`. An unterminated fence does not
    match the pattern and is therefore treated as ordinary text.

    Args:
        text: The full message text.

    Returns:
        str: The pieces re-joined in their original order.
    """
    fence_pattern = re.compile(r"```[\s\S]*?```", re.MULTILINE)

    def _pieces():
        """Yield converted prose and verbatim fences in document order."""
        cursor = 0
        for fence in fence_pattern.finditer(text):
            if fence.start() > cursor:
                yield _transform_non_code(text[cursor : fence.start()])
            yield fence.group(0)
            cursor = fence.end()
        if cursor < len(text):
            yield _transform_non_code(text[cursor:])

    return "".join(_pieces())


def _transform_non_code(segment: str) -> str:
    """Convert a fence-free segment while leaving inline code spans alone.

    Single-backtick spans are copied through unchanged. Every stretch of text
    around them, including empty stretches and the trailing remainder, is run
    through :func:`_transform_text_styles` and then
    :func:`_transform_math_segments`.

    Args:
        segment: Text that lies outside any fenced code block.

    Returns:
        str: The segment with prose converted and inline code preserved.
    """

    def _convert(chunk: str) -> str:
        """Apply the style pass, then the math pass, to one prose chunk."""
        return _transform_math_segments(_transform_text_styles(chunk))

    out: list[str] = []
    cursor = 0
    for span in re.finditer(r"`[^`]*`", segment):
        out.append(_convert(segment[cursor : span.start()]))
        out.append(span.group(0))
        cursor = span.end()
    out.append(_convert(segment[cursor:]))
    return "".join(out)


# ---------------------------------------------------------------------------
# Math segment transforms
# ---------------------------------------------------------------------------


def _transform_math_segments(s: str) -> str:
    """Replace every recognized math region in ``s`` with converted text.

    The rewrite rules run in a fixed order, each over the output of the
    previous one:

    1. display ``\\[...\\]``;
    2. ``equation``/``align``/``gather``/``multline`` environments (starred or
       not);
    3. display ``$$...$$``;
    4. inline ``\\(...\\)``;
    5. inline ``$...$``.

    Display forms (1-3) are emitted as plain converted text. Inline forms (4-5)
    are wrapped in backticks. Rule 5 is delegated to
    :func:`_repl_inline_dollar`.

    Args:
        s: Text outside code spans.

    Returns:
        str: The text with matched math regions replaced.
    """

    def _display(group_index: int):
        """Build a callback that converts the stripped capture ``group_index``."""

        def _render(match: re.Match) -> str:
            return _process_math(match.group(group_index).strip())

        return _render

    def _inline_paren(match: re.Match) -> str:
        """Convert the body of ``\\(...\\)`` and wrap it in backticks."""
        return f"`{_process_math(match.group(1))}`"

    # The single-dollar pattern enforces positional delimiter rules so that
    # currency ("$5", "$5 and $10", "$5-$10") is not mistaken for math:
    #   * the opening "$" is not preceded by "\" or "$" and is followed by a
    #     character that is neither whitespace nor "$";
    #   * the closing "$" is preceded by such a character and is not followed
    #     by a digit;
    #   * the span contains neither "$" nor a newline.
    rules = (
        (r"\\\[\s*([\s\S]*?)\s*\\\]", _display(1)),
        (
            r"\\begin\{(equation\*?|align\*?|gather\*?|multline\*?)\}"
            r"\s*([\s\S]*?)\s*\\end\{\1\}",
            _display(2),
        ),
        (r"\$\$\s*([\s\S]*?)\s*\$\$", _display(1)),
        (r"\\\(\s*([\s\S]*?)\s*\\\)", _inline_paren),
        (
            r"(?<![\\$])\$(?=[^\s$])([^$\n]*?)(?<=[^\s$])\$(?!\d)",
            _repl_inline_dollar,
        ),
    )
    for pattern, replacement in rules:
        s = re.sub(pattern, replacement, s)
    return s


def _repl_inline_dollar(m: re.Match) -> str:
    """Turn a matched ``$...$`` span into a backtick-wrapped expression.

    One content guard applies. If the text between the dollars contains a space
    after stripping, and has no backslash, digit, or any of the characters
    ``^_=+*/<>!|(){}[]``, it is treated as plain prose and the whole match is
    returned untouched. Otherwise the inner text goes through
    :func:`_process_math` and is wrapped in backticks.

    Args:
        m: A match whose group 1 is the text between the dollar signs.

    Returns:
        str: Either the original matched span or the converted inline code.
    """
    body = m.group(1)
    math_chars = "^_=+*/<>!|(){}[]"

    looks_like_math = "\\" in body
    if not looks_like_math:
        looks_like_math = any(c.isdigit() or c in math_chars for c in body)

    is_multi_word = " " in body.strip()
    if is_multi_word and not looks_like_math:
        return m.group(0)
    return f"`{_process_math(body)}`"


def _transform_text_styles(s: str) -> str:
    """Rewrite LaTeX text-style commands as Discord markdown.

    The substitutions run in this order, each on the result of the previous:

    * ``\\textbf{..}`` / ``\\mathbf{..}``             -> ``**..**``
    * ``\\textit{..}`` / ``\\mathit{..}`` / ``\\emph{..}`` -> ``*..*``
    * ``\\underline{..}``                            -> ``__..__``
    * ``\\texttt{..}`` / ``\\mathtt{..}``             -> inline code

    Only arguments without nested braces are matched, so an outer command
    wrapping another braced command is left for whichever rule still applies.

    Args:
        s: Text outside code spans.

    Returns:
        str: The text with matched style commands replaced.
    """
    style_rules = (
        (r"\\(?:textbf|mathbf)\{([^{}]+)\}", r"**\1**"),
        (r"\\(?:textit|mathit|emph)\{([^{}]+)\}", r"*\1*"),
        (r"\\underline\{([^{}]+)\}", r"__\1__"),
        (r"\\(?:texttt|mathtt)\{([^{}]+)\}", r"`\1`"),
    )
    for pattern, template in style_rules:
        s = re.sub(pattern, template, s)
    return s


# ---------------------------------------------------------------------------
# Core math processing pipeline
# ---------------------------------------------------------------------------


def _process_math(content: str) -> str:
    """Run one math expression through the conversion stages in order.

    Each stage receives the output of the one before it:
    :func:`_convert_matrix_environments`, :func:`_convert_limits_and_bounds`,
    :func:`_convert_nested_structures`, :func:`_apply_symbol_map`,
    :func:`_convert_sup_sub`, and :func:`_final_cleanup`.

    Args:
        content: The LaTeX expression with its math delimiters already removed.

    Returns:
        str: The expression after all stages have run.
    """
    stages = (
        _convert_matrix_environments,
        _convert_limits_and_bounds,
        _convert_nested_structures,
        _apply_symbol_map,
        _convert_sup_sub,
        _final_cleanup,
    )
    for stage in stages:
        content = stage(content)
    return content


# ---------------------------------------------------------------------------
# Brace extraction
# ---------------------------------------------------------------------------


def _extract_braced(s: str, start: int) -> Tuple[str, int]:
    """Return the contents of the balanced ``{...}`` group opening at ``start``.

    Nested braces are tracked by depth. A backslash followed by another
    character causes that character to be skipped, so escaped braces do not
    affect the depth count.

    Args:
        s: The string to scan.
        start: Index at which the opening ``{`` is expected.

    Returns:
        Tuple[str, int]: ``(inner, end)`` where ``end`` is the index just past
        the matching ``}``. If ``start`` is out of range, does not point at
        ``{``, or the group is never closed, ``("", start)`` is returned. An
        empty group ``{}`` yields ``("", start + 2)``.
    """
    length = len(s)
    if start >= length or s[start] != "{":
        return "", start

    depth = 0
    i = start
    while i < length:
        ch = s[i]
        if ch == "\\" and i + 1 < length:
            # Skip the backslash together with the character it escapes.
            i += 2
            continue
        if ch == "{":
            depth += 1
        elif ch == "}":
            depth -= 1
            if depth == 0:
                return s[start + 1 : i], i + 1
        i += 1

    # Ran off the end without closing the group.
    return "", start


# ---------------------------------------------------------------------------
# Symbol map + accents
# ---------------------------------------------------------------------------


def _apply_symbol_map(s: str) -> str:
    """Replace LaTeX fonts, accents, wrappers, and commands with Unicode text.

    The steps run in this order:

    1. ``\\mathbb{X}`` / ``\\mathcal{X}`` (single capital) via ``MATHBB_MAP`` /
       ``MATHCAL_MAP``, with a bracketed placeholder for unmapped letters, and
       ``\\mathfrak{...}`` as a bracketed placeholder.
    2. Accents ``\\hat``, ``\\bar``, ``\\tilde``, ``\\vec`` as a combining mark
       appended to the argument; ``\\dot`` / ``\\ddot`` as a combining mark on a
       single letter, or a ``-dot`` / ``-ddot`` suffix otherwise.
    3. ``\\binom{n}{k}`` as ``C(n,k)`` and ``\\pmod{m}`` as ``(mod m)``.
    4. Escaped braces ``\\{`` / ``\\}`` to literal braces.
    5. Text wrappers: ``\\mathbf{..}`` to ``**..**``; ``\\text``, ``\\mathrm``,
       ``\\textrm``, ``\\mathit``, ``\\mathsf``, ``\\mathtt`` unwrapped.
    6. A double backslash becomes a newline; ``\\left`` / ``\\right`` are
       dropped.
    7. Every remaining ``\\name`` is looked up in ``LATEX_TO_UNICODE``; unknown
       names are left as written.

    Only arguments without nested braces are matched by the brace-taking rules.

    Args:
        s: The partially converted expression.

    Returns:
        str: The expression after all substitutions.
    """

    # ---- 1. Math fonts -----------------------------------------------------
    def _blackboard(match: re.Match) -> str:
        """Look a ``\\mathbb`` letter up, falling back to a placeholder."""
        letter = match.group(1)
        return MATHBB_MAP.get(letter, f"\U0001d539\U0001d539[{letter}]")

    def _calligraphic(match: re.Match) -> str:
        """Look a ``\\mathcal`` letter up, falling back to a placeholder."""
        letter = match.group(1)
        return MATHCAL_MAP.get(
            letter, f"\U0001d4d2\U0001d4d0\U0001d4db[{letter}]"
        )

    def _fraktur(match: re.Match) -> str:
        """Render ``\\mathfrak`` content as a bracketed placeholder."""
        return f"\U0001d509\U0001d52f\U0001d51e\U0001d528[{match.group(1)}]"

    s = re.sub(r"\\mathbb\{([A-Z])\}", _blackboard, s)
    s = re.sub(r"\\mathcal\{([A-Z])\}", _calligraphic, s)
    s = re.sub(r"\\mathfrak\{([^{}]+)\}", _fraktur, s)

    # ---- 2. Accents --------------------------------------------------------
    # The combining mark is appended after the whole captured argument.
    combining_accents = (
        (r"\\hat\{([^{}]+)\}", "\u0302"),
        (r"\\bar\{([^{}]+)\}", "\u0304"),
        (r"\\tilde\{([^{}]+)\}", "\u0303"),
        (r"\\vec\{([^{}]+)\}", "\u20d7"),
    )
    for pattern, mark in combining_accents:
        s = re.sub(pattern, "\\1" + mark, s)

    def _dotted(mark: str, suffix: str):
        """Build a callback for ``\\dot`` / ``\\ddot`` style accents."""

        def _render(match: re.Match) -> str:
            body = match.group(1)
            if len(body) == 1 and body.isalpha():
                return body + mark
            # Multi-character content gets a readable text suffix instead.
            return f"{body}-{suffix}"

        return _render

    s = re.sub(r"\\dot\{([^{}]+)\}", _dotted("\u0307", "dot"), s)
    s = re.sub(r"\\ddot\{([^{}]+)\}", _dotted("\u0308", "ddot"), s)

    # ---- 3. Binomial / modulo ----------------------------------------------
    def _binomial(match: re.Match) -> str:
        """Render ``\\binom{n}{k}`` as ``C(n,k)``."""
        upper, lower = match.group(1), match.group(2)
        return f"C({upper},{lower})"

    s = re.sub(r"\\binom\{([^{}]+)\}\{([^{}]+)\}", _binomial, s)
    s = re.sub(r"\\pmod\{([^{}]+)\}", r"(mod \1)", s)

    # ---- 4. Literal braces -------------------------------------------------
    s = s.replace(r"\{", "{")
    s = s.replace(r"\}", "}")

    # ---- 5. Text wrappers --------------------------------------------------
    s = re.sub(r"\\text\(([^()]+)\)", r"\1", s)
    s = re.sub(r"\\mathbf\{([^{}]+)\}", r"**\1**", s)
    s = re.sub(
        r"\\(?:text|mathrm|textrm|mathit|mathsf|mathtt)\{([^{}]+)\}",
        r"\1",
        s,
    )

    # ---- 6. Line breaks and delimiter sizing -------------------------------
    s = s.replace("\\\\", "\n")
    # The negative lookahead keeps \left / \right from swallowing the start of
    # longer commands such as \leftarrow or \rightarrow, which must reach the
    # table lookup below.
    for sizing in (r"\\left(?![A-Za-z])\s*", r"\\right(?![A-Za-z])\s*"):
        s = re.sub(sizing, "", s)

    # ---- 7. Generic command lookup -----------------------------------------
    def _lookup(match: re.Match) -> str:
        """Return the table entry for a command, or the command unchanged."""
        return LATEX_TO_UNICODE.get(match.group(1), match.group(0))

    return re.sub(r"\\([A-Za-z]+)", _lookup, s)


# ---------------------------------------------------------------------------
# Superscript / subscript
# ---------------------------------------------------------------------------


def _convert_sup_sub(s: str) -> str:
    """Render ``^`` superscripts and ``_`` subscripts with Unicode glyphs.

    Four substitutions run in order: braced superscripts ``^{...}``,
    single-character superscripts ``^x`` (a digit or lowercase letter), braced
    subscripts ``_{...}``, and single-digit subscripts ``_3``.

    A braced span is converted only when every character in it has an entry in
    ``SUPERSCRIPT_MAP`` / ``SUBSCRIPT_MAP``; otherwise the whole span is kept
    as ``^{...}`` / ``_{...}``. A single character with no glyph is kept as
    ``^x`` / ``_x``. Only spans without nested braces are matched.

    Args:
        s: The partially converted expression.

    Returns:
        str: The expression with convertible scripts replaced.
    """

    def _render_span(match: re.Match, table: dict, marker: str) -> str:
        """Convert a braced span through ``table``, all or nothing."""
        body = match.group(1)
        if all(ch in table for ch in body):
            return "".join(table[ch] for ch in body)
        return f"{marker}{{{body}}}"

    def _render_char(match: re.Match, table: dict, marker: str) -> str:
        """Convert one character through ``table``, keeping it if unmapped."""
        ch = match.group(1)
        return table.get(ch, f"{marker}{ch}")

    # The tables are referenced inside the lambdas so they are only read when
    # a pattern actually matches.
    s = re.sub(
        r"\^\{([^{}]+)\}", lambda m: _render_span(m, SUPERSCRIPT_MAP, "^"), s
    )
    s = re.sub(
        r"\^([0-9a-z])", lambda m: _render_char(m, SUPERSCRIPT_MAP, "^"), s
    )
    s = re.sub(
        r"_\{([^{}]+)\}", lambda m: _render_span(m, SUBSCRIPT_MAP, "_"), s
    )
    s = re.sub(r"_([0-9])", lambda m: _render_char(m, SUBSCRIPT_MAP, "_"), s)
    return s


# ---------------------------------------------------------------------------
# Nested structures (\frac, \sqrt)
# ---------------------------------------------------------------------------


def _is_fully_parenthesized(expr: str) -> bool:
    """Return True when one ``(...)`` pair encloses the whole of ``expr``.

    The parenthesis that opens at index 0 must be the one that closes at the
    last index. ``"(a+b)"`` qualifies; ``"(a)+(b)"`` does not, because its
    first group closes before the end of the string.
    """
    if not (expr.startswith("(") and expr.endswith(")")):
        return False
    depth = 0
    last = len(expr) - 1
    for index, ch in enumerate(expr):
        if ch == "(":
            depth += 1
        elif ch == ")":
            depth -= 1
            if depth == 0:
                # The group opened at index 0 closes here; it wraps the whole
                # expression only if this is the final character.
                return index == last
    return False


def _is_simple_expression(expr: str) -> bool:
    """Decide whether an operand can be inlined without extra parentheses.

    Rules, checked in order on the stripped expression:

    * an empty string is simple;
    * an expression wrapped in a single matching pair of parentheses is simple
      (``"(a)+(b)"`` does not count, since its outer parentheses do not match
      each other);
    * anything containing ``/``, ``×``, ``·``, ``∑``, ``∫`` or ``∏`` is not
      simple;
    * anything mixing ``+``/``-`` with ``^``, ``_`` or ``√`` is not simple;
    * otherwise it is simple only if it is one run of characters with no
      whitespace and none of ``+ - * /``.

    Args:
        expr: The sub-expression to classify.

    Returns:
        bool: ``True`` if the expression can be inlined as-is, ``False`` if the
        caller should wrap it in parentheses.
    """
    if not expr:
        return True
    expr = expr.strip()
    if _is_fully_parenthesized(expr):
        return True
    # Division, products, and big operators always need grouping.
    if any(sym in expr for sym in ("/", "\u00d7", "\u00b7")):
        return False
    if any(sym in expr for sym in ("\u2211", "\u222b", "\u220f")):
        return False
    has_additive = "+" in expr or "-" in expr
    has_script = "^" in expr or "_" in expr or "\u221a" in expr
    if has_additive and has_script:
        return False
    return re.match(r"^[^\s+\-*/]+$", expr) is not None


def _convert_nested_structures(s: str) -> str:
    """Convert ``\\sqrt{...}`` and ``\\frac{...}{...}`` from the inside out.

    Iteratively rewrites roots as ``√(...)`` and fractions as ``num/den``, using
    :func:`_extract_braced` to grab balanced arguments and
    :func:`_is_simple_expression` to decide when each operand needs parentheses.
    The loop (capped at 10 passes) lets nested constructs collapse one layer per
    iteration until no further change occurs, so ``\\frac`` inside ``\\frac`` is
    handled correctly. Pure string work with no I/O.

    Called by :func:`_process_math` before the symbol map runs; it is a
    module-private helper.

    Args:
        s: The expression possibly containing nested roots and fractions.

    Returns:
        str: The expression with roots and fractions rendered in linear notation.
    """
    max_iterations = 10
    for _ in range(max_iterations):
        changed = False

        # \sqrt{...}
        pos = 0
        result: list[str] = []
        while pos < len(s):
            match = re.search(r"\\sqrt\{", s[pos:])
            if not match:
                result.append(s[pos:])
                break
            result.append(s[pos : pos + match.start()])
            brace_start = pos + match.end() - 1
            content, end_pos = _extract_braced(s, brace_start)
            if content:
                sq = "\u221a"
                simple = _is_simple_expression(content)
                part = content if simple else f"({content})"
                result.append(f"{sq}{part}")
                pos = end_pos
                changed = True
            else:
                result.append(s[pos : pos + match.end()])
                pos += match.end()
        s = "".join(result)

        # \frac{...}{...}
        pos = 0
        result = []
        while pos < len(s):
            match = re.search(r"\\frac\{", s[pos:])
            if not match:
                result.append(s[pos:])
                break
            result.append(s[pos : pos + match.start()])
            brace_start = pos + match.end() - 1
            numerator, next_pos = _extract_braced(s, brace_start)
            if not numerator:
                result.append(s[pos : pos + match.end()])
                pos += match.end()
                continue
            denominator, end_pos = _extract_braced(s, next_pos)
            if not denominator:
                result.append(s[pos : pos + match.end()])
                pos += match.end()
                continue
            num_str = (
                numerator if _is_simple_expression(numerator) else f"({numerator})"
            )
            den_str = (
                denominator
                if _is_simple_expression(denominator)
                else f"({denominator})"
            )
            result.append(f"{num_str}/{den_str}")
            pos = end_pos
            changed = True
        s = "".join(result)

        if not changed:
            break
    return s


# ---------------------------------------------------------------------------
# Limits, sums, integrals, products
# ---------------------------------------------------------------------------


def _convert_limits_and_bounds(s: str) -> str:
    """Convert ``\\lim``, ``\\sum``, ``\\int``, and ``\\prod`` with their bounds.

    Rewrites limit expressions as ``lim[...]`` and the big operators to their
    Unicode glyphs (∑, ∫, ∏) annotated with any subscript/superscript bounds in a
    readable ``[lower to upper]`` form, scanning each operator manually and using
    :func:`_extract_bounds` to pull the attached ``_``/``^`` arguments. Done early
    so the bounds are captured before the generic symbol map would otherwise
    strip the backslash commands. Pure string work with no I/O.

    Called by :func:`_process_math` as the second pipeline stage; it is a
    module-private helper.

    Args:
        s: The expression possibly containing bounded operators.

    Returns:
        str: The expression with limits and big operators converted and their
        bounds inlined.
    """
    s = re.sub(r"\\lim_\{([^{}]+)\}", lambda m: f"lim[{m.group(1)}]", s)
    s = re.sub(r"\\lim", "lim", s)

    cmds = [("\\sum", "\u2211"), ("\\int", "\u222b"), ("\\prod", "\u220f")]
    for cmd, symbol in cmds:
        pos = 0
        result: list[str] = []
        while pos < len(s):
            match = re.search(re.escape(cmd), s[pos:])
            if not match:
                result.append(s[pos:])
                break
            result.append(s[pos : pos + match.start()])
            bound_start = pos + match.end()
            lower, upper, end_pos = _extract_bounds(s, bound_start)
            if lower and upper:
                result.append(f"{symbol}[{lower} to {upper}]")
            elif lower:
                result.append(f"{symbol}[{lower}]")
            else:
                result.append(symbol)
            pos = end_pos
        s = "".join(result)

    return s


def _extract_bounds(s: str, pos: int) -> Tuple[str, str, int]:
    """Read the ``_lower`` and ``^upper`` bounds attached to an operator.

    A small scanner that, starting at *pos*, optionally consumes a subscript and
    then a superscript, each of which may be a braced group (via
    :func:`_extract_braced`), a backslash command, or a single alphanumeric
    character. Used to capture the bounds of ``\\sum``/``\\int``/``\\prod`` so
    :func:`_convert_limits_and_bounds` can render them inline. Pure string
    scanning with no I/O.

    Called only by :func:`_convert_limits_and_bounds`; it is a module-private
    helper.

    Args:
        s: The string being scanned.
        pos: Index immediately after the operator token, where bounds may begin.

    Returns:
        Tuple[str, str, int]: The lower bound, the upper bound (either may be an
        empty string when absent), and the index just past the consumed bounds.
    """
    lower = ""
    upper = ""

    if pos < len(s) and s[pos] == "_":
        pos += 1
        if pos < len(s):
            if s[pos] == "{":
                lower, pos = _extract_braced(s, pos)
            elif s[pos] == "\\":
                m = re.match(r"\\([A-Za-z]+)", s[pos:])
                if m:
                    lower = "\\" + m.group(1)
                    pos += len(m.group(0))
            elif s[pos].isalnum():
                lower = s[pos]
                pos += 1

    if pos < len(s) and s[pos] == "^":
        pos += 1
        if pos < len(s):
            if s[pos] == "{":
                upper, pos = _extract_braced(s, pos)
            elif s[pos] == "\\":
                m = re.match(r"\\([A-Za-z]+)", s[pos:])
                if m:
                    upper = "\\" + m.group(1)
                    pos += len(m.group(0))
            elif s[pos].isalnum():
                upper = s[pos]
                pos += 1

    return lower, upper, pos


# ---------------------------------------------------------------------------
# Matrix environments
# ---------------------------------------------------------------------------


def _convert_matrix_environments(s: str) -> str:
    """Lay out LaTeX matrix environments as bracketed multi-line text.

    Matches the ``pmatrix``/``bmatrix``/``matrix``/``vmatrix``/``Vmatrix``
    environments, splits their body on ``\\\\`` row breaks and ``&`` column
    separators, and re-emits each as a row-per-line block wrapped in the
    delimiter pair appropriate to the environment (parentheses, square brackets,
    single or double bars). Done first in the pipeline so the row/column markers
    are interpreted before later stages touch them. Pure string work with no
    I/O.

    Called by :func:`_process_math` as the first pipeline stage; it is a
    module-private helper.

    Args:
        s: The expression possibly containing matrix environments.

    Returns:
        str: The expression with matrix environments rendered as bracketed
        text grids.
    """
    matrix_envs = r"(pmatrix|bmatrix|matrix|vmatrix|Vmatrix)"
    beg = rf"\\begin\{{{matrix_envs}\}}\s*([\s\S]*?)\s*"
    end = rf"\\end\{{{matrix_envs}\}}"
    pattern = beg + end

    def _repl(m: re.Match) -> str:
        """Format one matched matrix environment into a bracketed text grid.

        Splits the captured body into rows and cells and wraps the result in the
        delimiter pair matching the environment name. Used as the ``re.sub``
        callback inside :func:`_convert_matrix_environments`.
        """
        env = m.group(1)
        rows = re.split(r"\\\\", m.group(2))
        fmt = []
        for row in rows:
            cells = re.split(r"&", row.strip())
            fmt.append(" ".join(c.strip() for c in cells))
        body = "\n".join(fmt)
        if env == "pmatrix":
            return f"(\n{body}\n)"
        if env == "bmatrix":
            return f"[\n{body}\n]"
        if env == "vmatrix":
            return f"|\n{body}\n|"
        if env == "Vmatrix":
            return f"||\n{body}\n||"
        return body

    return re.sub(pattern, _repl, s)


# ---------------------------------------------------------------------------
# Final cleanup
# ---------------------------------------------------------------------------


def _final_cleanup(s: str) -> str:
    """Strip leftover LaTeX scaffolding and normalize whitespace.

    The last pipeline stage: it removes any surviving ``\\begin``/``\\end``
    wrappers, drops the backslash from unconverted commands, deletes stray
    braces, and collapses redundant spaces and blank lines so the rendered math
    reads cleanly. Best-effort tidy-up of whatever earlier stages did not
    convert. Pure string work with no I/O.

    Called by :func:`_process_math` as the final stage; it is a module-private
    helper.

    Args:
        s: The mostly-converted expression to finalize.

    Returns:
        str: The cleaned, whitespace-normalized expression.
    """
    s = re.sub(r"\\(begin|end)\{[^}]*\}", "", s)
    s = re.sub(r"\\([A-Za-z]+)", r"\1", s)
    s = re.sub(r"[{}]", "", s)
    s = re.sub(r"\s+\n", "\n", s)
    s = re.sub(r"\n{3,}", "\n\n", s)
    s = re.sub(r" {2,}", " ", s)
    return s.strip()