# Source code for latex_converter

"""LaTeX to Discord Formatting Converter.

Converts LaTeX/math formatting to Discord-friendly text with Unicode symbols.

Strategy:
- Preserve fenced and inline code as-is.
- Convert display math to plain text (no code fences).
- Convert inline math to inline code with lightweight symbol mapping.
- Replace common LaTeX commands with Unicode equivalents.
- Handle nested ``\\frac{}{}``, ``\\sqrt{}``, superscript/subscript to Unicode.
- Support matrix environments and special math fonts.
"""

import re
import logging
from typing import Tuple

# Module-level logger, named after this module; used to report a failed
# conversion as a warning instead of raising.
logger = logging.getLogger(__name__)

# ---------------------------------------------------------------------------
# Unicode mapping tables (module-level for reuse)
# ---------------------------------------------------------------------------

# Maps a LaTeX command name (without the leading backslash) to the text that
# replaces it.  Most entries are single Unicode symbols; function-like
# operators (sin, log, lim, ...) map to their own name so that only the
# backslash is dropped.
LATEX_TO_UNICODE = {
    # Greek lowercase
    "alpha": "\u03b1",
    "beta": "\u03b2",
    "gamma": "\u03b3",
    "delta": "\u03b4",
    "epsilon": "\u03b5",
    "zeta": "\u03b6",
    "eta": "\u03b7",
    "theta": "\u03b8",
    "iota": "\u03b9",
    "kappa": "\u03ba",
    "lambda": "\u03bb",
    "mu": "\u03bc",
    "nu": "\u03bd",
    "xi": "\u03be",
    "pi": "\u03c0",
    "rho": "\u03c1",
    "sigma": "\u03c3",
    "tau": "\u03c4",
    "upsilon": "\u03c5",
    "phi": "\u03c6",
    "chi": "\u03c7",
    "psi": "\u03c8",
    "omega": "\u03c9",
    # Greek uppercase
    "Gamma": "\u0393",
    "Delta": "\u0394",
    "Theta": "\u0398",
    "Lambda": "\u039b",
    "Xi": "\u039e",
    "Pi": "\u03a0",
    "Sigma": "\u03a3",
    "Upsilon": "\u03a5",
    "Phi": "\u03a6",
    "Psi": "\u03a8",
    "Omega": "\u03a9",
    # Greek variants
    "varepsilon": "\u03b5",
    "vartheta": "\u03d1",
    "varpi": "\u03d6",
    "varrho": "\u03f1",
    "varsigma": "\u03c2",
    "varphi": "\u03c6",
    # Operators and comparison relations
    "times": "\u00d7",
    "cdot": "\u00b7",
    "div": "\u00f7",
    "circ": "\u2218",
    "pm": "\u00b1",
    "mp": "\u2213",
    "leq": "\u2264",
    "geq": "\u2265",
    "neq": "\u2260",
    # Short aliases of the three relations above, plus much-less/greater-than
    "le": "\u2264",
    "ge": "\u2265",
    "ne": "\u2260",
    "ll": "\u226a",
    "gg": "\u226b",
    "approx": "\u2248",
    "sim": "\u223c",
    "infty": "\u221e",
    "infinity": "\u221e",
    # Arrows (Knuth up-arrow / tetration notation lives here: a \uparrow\uparrow n)
    "to": "\u2192",
    "gets": "\u2190",
    "rightarrow": "\u2192",
    "leftarrow": "\u2190",
    "Rightarrow": "\u21d2",
    "Leftarrow": "\u21d0",
    "Leftrightarrow": "\u21d4",
    "leftrightarrow": "\u2194",
    "uparrow": "\u2191",
    "downarrow": "\u2193",
    "updownarrow": "\u2195",
    "Uparrow": "\u21d1",
    "Downarrow": "\u21d3",
    "Updownarrow": "\u21d5",
    "longleftarrow": "\u27f5",
    "longrightarrow": "\u27f6",
    "longleftrightarrow": "\u27f7",
    "Longleftarrow": "\u27f8",
    "Longrightarrow": "\u27f9",
    "Longleftrightarrow": "\u27fa",
    "mapsto": "\u21a6",
    "longmapsto": "\u27fc",
    "hookleftarrow": "\u21a9",
    "hookrightarrow": "\u21aa",
    "nearrow": "\u2197",
    "searrow": "\u2198",
    "swarrow": "\u2199",
    "nwarrow": "\u2196",
    "nleftarrow": "\u219a",
    "nrightarrow": "\u219b",
    "nleftrightarrow": "\u21ae",
    "nLeftarrow": "\u21cd",
    "nRightarrow": "\u21cf",
    "nLeftrightarrow": "\u21ce",
    "leftleftarrows": "\u21c7",
    "rightrightarrows": "\u21c9",
    "leftrightarrows": "\u21c6",
    "rightleftarrows": "\u21c4",
    "leftharpoonup": "\u21bc",
    "leftharpoondown": "\u21bd",
    "rightharpoonup": "\u21c0",
    "rightharpoondown": "\u21c1",
    "upharpoonright": "\u21be",
    "upharpoonleft": "\u21bf",
    "downharpoonright": "\u21c2",
    "downharpoonleft": "\u21c3",
    "leftrightharpoons": "\u21cb",
    "rightleftharpoons": "\u21cc",
    # Calculus, big operators, quantifiers and connectives
    "partial": "\u2202",
    "nabla": "\u2207",
    "sum": "\u2211",
    "prod": "\u220f",
    "int": "\u222b",
    "forall": "\u2200",
    "exists": "\u2203",
    "neg": "\u00ac",
    "lnot": "\u00ac",
    "lor": "\u2228",
    "vee": "\u2228",
    "land": "\u2227",
    "wedge": "\u2227",
    "oplus": "\u2295",
    "otimes": "\u2297",
    # Set theory
    "in": "\u2208",
    "notin": "\u2209",
    "subset": "\u2282",
    "supset": "\u2283",
    "subseteq": "\u2286",
    "supseteq": "\u2287",
    "cup": "\u222a",
    "cap": "\u2229",
    "emptyset": "\u2205",
    "varnothing": "\u2205",
    "setminus": "\u2216",
    "complement": "\u2201",
    # Logic
    "implies": "\u21d2",
    "iff": "\u21d4",
    "equiv": "\u2261",
    "therefore": "\u2234",
    "because": "\u2235",
    # Relations
    "cong": "\u2245",
    "ncong": "\u2247",
    "propto": "\u221d",
    "prec": "\u227a",
    "succ": "\u227b",
    "preceq": "\u2aaf",
    "succeq": "\u2ab0",
    "parallel": "\u2225",
    "perp": "\u22a5",
    "asymp": "\u224d",
    # Dots
    "ldots": "...",
    "cdots": "\u22ef",
    "vdots": "\u22ee",
    "ddots": "\u22f1",
    "dots": "...",
    # Brackets and delimiters
    "langle": "\u27e8",
    "rangle": "\u27e9",
    "lfloor": "\u230a",
    "rfloor": "\u230b",
    "lceil": "\u2308",
    "rceil": "\u2309",
    "lbrace": "{",
    "rbrace": "}",
    # Trigonometric functions (preserve as-is)
    "sin": "sin",
    "cos": "cos",
    "tan": "tan",
    "cot": "cot",
    "sec": "sec",
    "csc": "csc",
    "arcsin": "arcsin",
    "arccos": "arccos",
    "arctan": "arctan",
    "sinh": "sinh",
    "cosh": "cosh",
    "tanh": "tanh",
    "coth": "coth",
    "sech": "sech",
    "csch": "csch",
    # Logarithmic and exponential
    "ln": "ln",
    "log": "log",
    "exp": "exp",
    "lg": "lg",
    # Mathematical functions
    "det": "det",
    "dim": "dim",
    "ker": "ker",
    "deg": "deg",
    "gcd": "gcd",
    "lcm": "lcm",
    "arg": "arg",
    "hom": "hom",
    "Pr": "Pr",
    "mod": "mod",
    # Limits and bounds
    "lim": "lim",
    "sup": "sup",
    "inf": "inf",
    "max": "max",
    "min": "min",
    "limsup": "lim sup",
    "liminf": "lim inf",
    # Other symbols
    "ell": "\u2113",
    "hbar": "\u210f",
    "Re": "\u211c",
    "Im": "\u2111",
    "wp": "\u2118",
    "angle": "\u2220",
    "triangle": "\u25b3",
    "square": "\u25a1",
    "diamond": "\u25c7",
    "star": "\u22c6",
    "dagger": "\u2020",
    "ddagger": "\u2021",
}

# Blackboard-bold (double-struck) capitals for \mathbb{X}, covering A-Z.
# C, H, N, P, Q, R and Z live in the Letterlike Symbols block; the remaining
# letters live in the Mathematical Alphanumeric Symbols block, where those
# seven code points are reserved.
MATHBB_MAP = {
    "A": "\U0001d538",
    "B": "\U0001d539",
    "C": "\u2102",
    "D": "\U0001d53b",
    "E": "\U0001d53c",
    "F": "\U0001d53d",
    "G": "\U0001d53e",
    "H": "\u210d",
    "I": "\U0001d540",
    "J": "\U0001d541",
    "K": "\U0001d542",
    "L": "\U0001d543",
    "M": "\U0001d544",
    "N": "\u2115",
    "O": "\U0001d546",
    "P": "\u2119",
    "Q": "\u211a",
    "R": "\u211d",
    "S": "\U0001d54a",
    "T": "\U0001d54b",
    "U": "\U0001d54c",
    "V": "\U0001d54d",
    "W": "\U0001d54e",
    "X": "\U0001d54f",
    "Y": "\U0001d550",
    "Z": "\u2124",
}

# Script (calligraphic) capitals for \mathcal{X}.
# Unicode keeps B, E, F, H, I, L, M and R in the Letterlike Symbols block
# (U+21xx); the matching slots in the Mathematical Script range are reserved.
# The other letters come from U+1D49C-U+1D4B5.
MATHCAL_MAP = {
    "A": "\U0001d49c",
    "B": "\u212c",
    "C": "\U0001d49e",
    "D": "\U0001d49f",
    "E": "\u2130",
    "F": "\u2131",
    "G": "\U0001d4a2",
    "H": "\u210b",
    "I": "\u2110",
    "J": "\U0001d4a5",
    "K": "\U0001d4a6",
    "L": "\u2112",
    "M": "\u2133",
    "N": "\U0001d4a9",
    "O": "\U0001d4aa",
    "P": "\U0001d4ab",
    "Q": "\U0001d4ac",
    "R": "\u211b",
    "S": "\U0001d4ae",
    "T": "\U0001d4af",
    "U": "\U0001d4b0",
    "V": "\U0001d4b1",
    "W": "\U0001d4b2",
    "X": "\U0001d4b3",
    "Y": "\U0001d4b4",
    "Z": "\U0001d4b5",
}

# Characters that have a Unicode superscript form, paired position by position
# with that form: digits 0-9, then "+", "-", "(", ")", "=", "n", "i".
SUPERSCRIPT_MAP = dict(
    zip(
        "0123456789" "+-()=" "ni",
        "\u2070\u00b9\u00b2\u00b3\u2074\u2075\u2076\u2077\u2078\u2079"
        "\u207a\u207b\u207d\u207e\u207c"
        "\u207f\u2071",
    )
)

# Characters that have a Unicode subscript form.  The ten subscript digits are
# contiguous starting at U+2080, so they are generated; the signs and
# parentheses are listed explicitly.
SUBSCRIPT_MAP = {str(digit): chr(0x2080 + digit) for digit in range(10)}
SUBSCRIPT_MAP.update(
    {
        "+": "\u208a",
        "-": "\u208b",
        "(": "\u208d",
        ")": "\u208e",
        "=": "\u208c",
    }
)


# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------


def convert_latex_to_discord(text: str) -> str:
    r"""Convert common LaTeX/math formatting to Discord-friendly text.

    Public entry point of the module.  The input is first checked for any
    math delimiter (``\[``, ``\(``, ``\begin{`` or ``$``); text without one is
    returned untouched.  Otherwise the work is delegated to
    :func:`_do_convert`, which leaves code blocks alone and rewrites the rest
    using Unicode symbols.

    Conversion is best-effort: if it raises, a warning is logged and the
    original text is returned, so a malformed expression cannot blank out a
    message.

    Args:
        text: The candidate text, possibly containing LaTeX math and
            formatting commands.

    Returns:
        str: The converted text.  The input is returned unchanged when it
        contains no math delimiter or when conversion raised.  Falsy input
        (``""``, ``None``, ...) yields ``""``; a truthy non-string value is
        handed back as-is.
    """
    if not isinstance(text, str) or not text:
        return text or ""

    # Cheap gate: a single "$" already covers "$$", so one alternative suffices.
    if not re.search(r"\\\[|\\\(|\\begin\{|\$", text):
        return text

    try:
        return _do_convert(text)
    except Exception as e:
        # Deliberate catch-all at the module boundary: never let a conversion
        # bug destroy the message, just report it and fall back.
        logger.warning("LaTeX conversion failed: %s", e)
        return text

# ---------------------------------------------------------------------------
# Internal implementation
# ---------------------------------------------------------------------------

# Triple-backtick fenced block (non-greedy, so neighbouring fences stay apart).
_FENCE_RE = re.compile(r"```[\s\S]*?```", re.MULTILINE)
# Single-backtick inline code span.
_INLINE_CODE_RE = re.compile(r"`[^`]*`")
# Openers of the two constructs that take balanced-brace arguments.
_SQRT_OPEN_RE = re.compile(r"\\sqrt\{")
_FRAC_OPEN_RE = re.compile(r"\\frac\{")
# A backslash command such as \alpha; group 1 is the bare name.
_COMMAND_RE = re.compile(r"\\([A-Za-z]+)")
# Safety cap on the rewrite passes in _convert_nested_structures.
_MAX_NESTED_PASSES = 10
# Big operators and their glyphs.  The negative lookahead keeps e.g. \int from
# matching the first four characters of a longer command such as \intercal.
_BIG_OPERATORS = tuple(
    (re.compile(re.escape(_cmd) + r"(?![A-Za-z])"), _glyph)
    for _cmd, _glyph in (
        ("\\sum", "\u2211"),
        ("\\int", "\u222b"),
        ("\\prod", "\u220f"),
    )
)


def _do_convert(text: str) -> str:
    """Convert everything outside triple-backtick fenced code blocks.

    Fenced blocks are copied through verbatim; each stretch of text between
    them is passed to :func:`_transform_non_code`.

    Args:
        text: The full message text.

    Returns:
        str: The reassembled text with fenced blocks preserved.
    """
    out_parts: list[str] = []
    last_end = 0
    for m in _FENCE_RE.finditer(text):
        before = text[last_end : m.start()]
        if before:
            out_parts.append(_transform_non_code(before))
        out_parts.append(m.group(0))
        last_end = m.end()
    tail = text[last_end:]
    if tail:
        out_parts.append(_transform_non_code(tail))
    return "".join(out_parts)


def _transform_non_code(segment: str) -> str:
    """Convert a non-fenced segment while preserving inline code spans.

    Single-backtick spans are kept verbatim.  The text between them goes
    through :func:`_transform_text_styles` and then
    :func:`_transform_math_segments`.

    Args:
        segment: Text that lies outside any fenced code block.

    Returns:
        str: The converted segment.
    """

    def _convert(part: str) -> str:
        """Apply text-style conversion, then math conversion."""
        return _transform_math_segments(_transform_text_styles(part))

    pieces: list[str] = []
    idx = 0
    for im in _INLINE_CODE_RE.finditer(segment):
        pieces.append(_convert(segment[idx : im.start()]))
        pieces.append(im.group(0))
        idx = im.end()
    pieces.append(_convert(segment[idx:]))
    return "".join(pieces)


# ---------------------------------------------------------------------------
# Math segment transforms
# ---------------------------------------------------------------------------


def _transform_math_segments(s: str) -> str:
    r"""Find every math delimiter in a segment and convert its contents.

    Handles, in order: display ``\[...\]``, the equation/align/gather/multline
    environments, ``$$...$$``, inline ``\(...\)`` and inline ``$...$``.
    Display math is emitted as plain text; inline math is wrapped in
    backticks.  Every inner expression goes through :func:`_process_math`.

    Args:
        s: Text outside code spans.

    Returns:
        str: The text with recognized math regions converted.
    """
    # Display math: \[...\]
    s = re.sub(
        r"\\\[\s*([\s\S]*?)\s*\\\]",
        lambda m: _process_math(m.group(1).strip()),
        s,
    )
    # equation, align, gather, multline environments
    s = re.sub(
        r"\\begin\{(equation\*?|align\*?|gather\*?|multline\*?)\}"
        r"\s*([\s\S]*?)\s*\\end\{\1\}",
        lambda m: _process_math(m.group(2).strip()),
        s,
    )
    # $$...$$
    s = re.sub(
        r"\$\$\s*([\s\S]*?)\s*\$\$",
        lambda m: _process_math(m.group(1).strip()),
        s,
    )
    # Inline \(...\)
    s = re.sub(
        r"\\\(\s*([\s\S]*?)\s*\\\)",
        lambda m: f"`{_process_math(m.group(1))}`",
        s,
    )
    # Inline $...$ -- positional delimiter rules keep currency ("$5",
    # "$5 and $10", "$5-$10") from being mistaken for math:
    #   * the opening "$" is not preceded by "\" or "$" and is followed by a
    #     non-space, non-"$" character;
    #   * the closing "$" is preceded by a non-space, non-"$" character and is
    #     not immediately followed by a digit;
    #   * the span contains neither "$" nor a newline.
    s = re.sub(
        r"(?<![\\$])\$(?=[^\s$])([^$\n]*?)(?<=[^\s$])\$(?!\d)",
        _repl_inline_dollar,
        s,
    )
    return s


def _repl_inline_dollar(m: re.Match) -> str:
    """Convert a positionally-validated ``$...$`` span to inline code.

    The calling regex has already applied the delimiter rules.  One content
    guard remains: a span that contains a space but no backslash, digit,
    operator or bracket is treated as prose and returned unchanged.

    Args:
        m: Match whose group 1 is the text between the dollar signs.

    Returns:
        str: The backtick-wrapped converted expression, or the original
        ``$...$`` text for operator-free multi-word prose.
    """
    inner = m.group(1)
    has_math = "\\" in inner or any(
        ch.isdigit() or ch in "^_=+*/<>!|(){}[]" for ch in inner
    )
    if " " in inner.strip() and not has_math:
        return m.group(0)
    return f"`{_process_math(inner)}`"


def _transform_text_styles(s: str) -> str:
    r"""Map LaTeX text-formatting commands onto Discord markdown.

    ``\textbf``/``\mathbf`` become ``**bold**``, ``\textit``/``\mathit``/
    ``\emph`` become ``*italic*``, ``\underline`` becomes ``__underline__`` and
    ``\texttt``/``\mathtt`` become inline code.  Only arguments without nested
    braces are matched.

    Args:
        s: Text outside code spans.

    Returns:
        str: The text with recognized style commands rewritten.
    """
    s = re.sub(r"\\(?:textbf|mathbf)\{([^{}]+)\}", r"**\1**", s)
    s = re.sub(r"\\(?:textit|mathit|emph)\{([^{}]+)\}", r"*\1*", s)
    s = re.sub(r"\\underline\{([^{}]+)\}", r"__\1__", s)
    s = re.sub(r"\\(?:texttt|mathtt)\{([^{}]+)\}", r"`\1`", s)
    return s


# ---------------------------------------------------------------------------
# Core math processing pipeline
# ---------------------------------------------------------------------------


def _process_math(content: str) -> str:
    r"""Run the full math-conversion pipeline on one extracted expression.

    Stages, in order: matrices, limits and big operators, ``\frac``/``\sqrt``,
    the symbol map, super/subscripts, final cleanup.  The order matters:
    later stages rewrite or strip the backslash commands and braces that the
    earlier stages parse.

    Args:
        content: The LaTeX expression with its delimiters already removed.

    Returns:
        str: The expression rendered with Unicode symbols and plain text.
    """
    content = _convert_matrix_environments(content)
    content = _convert_limits_and_bounds(content)
    content = _convert_nested_structures(content)
    content = _apply_symbol_map(content)
    content = _convert_sup_sub(content)
    content = _final_cleanup(content)
    return content


# ---------------------------------------------------------------------------
# Brace extraction
# ---------------------------------------------------------------------------


def _extract_braced(s: str, start: int) -> Tuple[str, int]:
    """Extract one balanced ``{...}`` group, honoring nesting and escapes.

    Args:
        s: The string being scanned.
        start: Index of the opening brace.

    Returns:
        Tuple[str, int]: The inner content and the index just past the
        matching close brace.  If ``s[start]`` is not ``{`` or the group is
        never closed, returns ``("", start)``.
    """
    if start >= len(s) or s[start] != "{":
        return "", start
    depth = 0
    pos = start
    while pos < len(s):
        ch = s[pos]
        if ch == "{":
            depth += 1
        elif ch == "}":
            depth -= 1
            if depth == 0:
                return s[start + 1 : pos], pos + 1
        elif ch == "\\" and pos + 1 < len(s):
            # Skip the escaped character so "\{" / "\}" do not affect depth.
            pos += 1
        pos += 1
    return "", start


# ---------------------------------------------------------------------------
# Symbol map + accents
# ---------------------------------------------------------------------------


def _apply_symbol_map(s: str) -> str:
    r"""Replace LaTeX commands, fonts, and accents with Unicode equivalents.

    Handles ``\mathbb``/``\mathcal``/``\mathfrak``, the combining accents
    (``\hat``, ``\bar``, ``\tilde``, ``\vec``, ``\dot``, ``\ddot``),
    ``\binom``, ``\pmod``, escaped braces, the ``\text`` family, ``\\`` line
    breaks and ``\left``/``\right``.  Every remaining ``\command`` is then
    looked up in ``LATEX_TO_UNICODE``; unknown commands are left intact.

    Args:
        s: The partially-converted expression.

    Returns:
        str: The expression with commands, fonts and accents converted.
    """

    def _repl_mathbb(m: re.Match) -> str:
        """Look the letter up in ``MATHBB_MAP``; bracketed placeholder if absent."""
        return MATHBB_MAP.get(m.group(1), f"\U0001d539\U0001d539[{m.group(1)}]")

    def _repl_mathcal(m: re.Match) -> str:
        """Look the letter up in ``MATHCAL_MAP``; bracketed placeholder if absent."""
        return MATHCAL_MAP.get(
            m.group(1), f"\U0001d4d2\U0001d4d0\U0001d4db[{m.group(1)}]"
        )

    s = re.sub(r"\\mathbb\{([A-Z])\}", _repl_mathbb, s)
    s = re.sub(r"\\mathcal\{([A-Z])\}", _repl_mathcal, s)
    s = re.sub(
        r"\\mathfrak\{([^{}]+)\}",
        lambda m: f"\U0001d509\U0001d52f\U0001d51e\U0001d528[{m.group(1)}]",
        s,
    )

    # Accents: append the combining mark after the accented content.
    s = re.sub(r"\\hat\{([^{}]+)\}", "\\1\u0302", s)
    s = re.sub(r"\\bar\{([^{}]+)\}", "\\1\u0304", s)
    s = re.sub(r"\\tilde\{([^{}]+)\}", "\\1\u0303", s)
    s = re.sub(r"\\vec\{([^{}]+)\}", "\\1\u20d7", s)

    def _repl_dot(m: re.Match) -> str:
        """Combining dot above for one letter, else a ``-dot`` suffix."""
        c = m.group(1)
        return (c + "\u0307") if len(c) == 1 and c.isalpha() else f"{c}-dot"

    def _repl_ddot(m: re.Match) -> str:
        """Combining diaeresis for one letter, else a ``-ddot`` suffix."""
        c = m.group(1)
        return (c + "\u0308") if len(c) == 1 and c.isalpha() else f"{c}-ddot"

    s = re.sub(r"\\dot\{([^{}]+)\}", _repl_dot, s)
    s = re.sub(r"\\ddot\{([^{}]+)\}", _repl_ddot, s)

    # Binomial / modulo
    s = re.sub(
        r"\\binom\{([^{}]+)\}\{([^{}]+)\}",
        lambda m: f"C({m.group(1)},{m.group(2)})",
        s,
    )
    s = re.sub(r"\\pmod\{([^{}]+)\}", r"(mod \1)", s)

    # Literal braces
    s = s.replace(r"\{", "{").replace(r"\}", "}")

    # Text wrappers
    s = re.sub(r"\\text\(([^()]+)\)", r"\1", s)
    s = re.sub(r"\\mathbf\{([^{}]+)\}", r"**\1**", s)
    s = re.sub(
        r"\\(?:text|mathrm|textrm|mathit|mathsf|mathtt)\{([^{}]+)\}",
        r"\1",
        s,
    )
    s = s.replace("\\\\", "\n")

    # Strip only the \left / \right sizing commands.  The lookahead protects
    # longer commands that merely start with "left"/"right" (\leftarrow,
    # \rightarrow, \leftharpoonup, ...), which must reach the lookup below.
    s = re.sub(r"\\left(?![A-Za-z])\s*", "", s)
    s = re.sub(r"\\right(?![A-Za-z])\s*", "", s)

    def _repl_cmd(m: re.Match) -> str:
        """Map a known command to its symbol; leave unknown ones unchanged."""
        return LATEX_TO_UNICODE.get(m.group(1), m.group(0))

    return re.sub(r"\\([A-Za-z]+)", _repl_cmd, s)


# ---------------------------------------------------------------------------
# Superscript / subscript
# ---------------------------------------------------------------------------


def _convert_sup_sub(s: str) -> str:
    """Convert ``^`` superscripts and ``_`` subscripts to Unicode where possible.

    Braced groups are converted only when every character has a glyph in
    ``SUPERSCRIPT_MAP``/``SUBSCRIPT_MAP``; otherwise the ``^{...}``/``_{...}``
    text is kept.  Single-character forms fall back to ``^x``/``_x``.

    Args:
        s: The partially-converted expression.

    Returns:
        str: The expression with super/subscripts rendered where glyphs exist.
    """

    def _repl_super_block(m: re.Match) -> str:
        """Render ``^{...}`` fully in superscript glyphs, or keep it literal."""
        content = m.group(1)
        if all(ch in SUPERSCRIPT_MAP for ch in content):
            return "".join(SUPERSCRIPT_MAP[ch] for ch in content)
        return f"^{{{content}}}"

    def _repl_super_single(m: re.Match) -> str:
        """Render ``^x`` as a superscript glyph, or keep it literal."""
        return SUPERSCRIPT_MAP.get(m.group(1), f"^{m.group(1)}")

    def _repl_sub_block(m: re.Match) -> str:
        """Render ``_{...}`` fully in subscript glyphs, or keep it literal."""
        content = m.group(1)
        if all(ch in SUBSCRIPT_MAP for ch in content):
            return "".join(SUBSCRIPT_MAP[ch] for ch in content)
        return f"_{{{content}}}"

    def _repl_sub_single(m: re.Match) -> str:
        """Render ``_x`` as a subscript glyph, or keep it literal."""
        return SUBSCRIPT_MAP.get(m.group(1), f"_{m.group(1)}")

    s = re.sub(r"\^\{([^{}]+)\}", _repl_super_block, s)
    s = re.sub(r"\^([0-9a-z])", _repl_super_single, s)
    s = re.sub(r"_\{([^{}]+)\}", _repl_sub_block, s)
    s = re.sub(r"_([0-9])", _repl_sub_single, s)
    return s


# ---------------------------------------------------------------------------
# Nested structures (\frac, \sqrt)
# ---------------------------------------------------------------------------


def _is_simple_expression(expr: str) -> bool:
    """Judge whether an operand can be inlined without parentheses.

    Args:
        expr: The sub-expression to classify.

    Returns:
        bool: ``True`` for an empty string, an operand already wrapped in
        parentheses, or a single token without whitespace or ``+ - * /``.
        ``False`` when it contains division, a product sign, a big operator,
        or additive operators combined with ``^``/``_``/root structure.
    """
    if not expr:
        return True
    expr = expr.strip()
    if expr.startswith("(") and expr.endswith(")"):
        return True
    if "/" in expr or "\u00d7" in expr or "\u00b7" in expr:
        return False
    if "\u2211" in expr or "\u222b" in expr or "\u220f" in expr:
        return False
    ops = "+" in expr or "-" in expr
    subs = "^" in expr or "_" in expr or "\u221a" in expr
    if ops and subs:
        return False
    return bool(re.match(r"^[^\s+\-*/]+$", expr))


def _parenthesize_if_complex(expr: str) -> str:
    """Wrap *expr* in parentheses unless :func:`_is_simple_expression` accepts it."""
    return expr if _is_simple_expression(expr) else f"({expr})"


def _rewrite_sqrt(s: str) -> Tuple[str, bool]:
    r"""Rewrite each ``\sqrt{...}`` in *s*; return the text and a changed flag."""
    out: list[str] = []
    pos = 0
    changed = False
    while True:
        m = _SQRT_OPEN_RE.search(s, pos)
        if m is None:
            out.append(s[pos:])
            break
        out.append(s[pos : m.start()])
        content, end_pos = _extract_braced(s, m.end() - 1)
        if not content:
            # Empty or unbalanced argument: emit only the opener itself (the
            # text before it is already in ``out``) and scan on.
            out.append(m.group(0))
            pos = m.end()
            continue
        # Convert the radicand first, so the parenthesization check sees the
        # final linear form rather than raw LaTeX.
        radicand = _convert_nested_structures(content)
        out.append("\u221a" + _parenthesize_if_complex(radicand))
        pos = end_pos
        changed = True
    return "".join(out), changed


def _rewrite_frac(s: str) -> Tuple[str, bool]:
    r"""Rewrite each ``\frac{..}{..}`` in *s*; return the text and a changed flag."""
    out: list[str] = []
    pos = 0
    changed = False
    while True:
        m = _FRAC_OPEN_RE.search(s, pos)
        if m is None:
            out.append(s[pos:])
            break
        out.append(s[pos : m.start()])
        numerator, after_num = _extract_braced(s, m.end() - 1)
        denominator, end_pos = (
            _extract_braced(s, after_num) if numerator else ("", after_num)
        )
        if not (numerator and denominator):
            # Malformed fraction: keep the opener verbatim, exactly once.
            out.append(m.group(0))
            pos = m.end()
            continue
        num_str = _parenthesize_if_complex(_convert_nested_structures(numerator))
        den_str = _parenthesize_if_complex(_convert_nested_structures(denominator))
        out.append(f"{num_str}/{den_str}")
        pos = end_pos
        changed = True
    return "".join(out), changed


def _convert_nested_structures(s: str) -> str:
    r"""Convert ``\sqrt{...}`` and ``\frac{...}{...}`` from the inside out.

    Roots become ``√x`` or ``√(...)`` and fractions become ``num/den``.
    Each argument is converted recursively before it is checked for
    parentheses, so ``\frac{a}{\frac{b}{c}}`` yields ``a/(b/c)`` and
    ``\sqrt{\frac{a}{b}}`` yields ``√(a/b)``.  A construct whose argument is
    empty or unbalanced is left in place.  The outer loop repeats, up to
    ``_MAX_NESTED_PASSES`` times, until a pass changes nothing.

    Args:
        s: The expression possibly containing roots and fractions.

    Returns:
        str: The expression with roots and fractions in linear notation.
    """
    for _ in range(_MAX_NESTED_PASSES):
        s, sqrt_changed = _rewrite_sqrt(s)
        s, frac_changed = _rewrite_frac(s)
        if not (sqrt_changed or frac_changed):
            break
    return s


# ---------------------------------------------------------------------------
# Limits, sums, integrals, products
# ---------------------------------------------------------------------------


def _convert_limits_and_bounds(s: str) -> str:
    r"""Convert ``\lim``, ``\sum``, ``\int`` and ``\prod`` with their bounds.

    ``\lim_{x}`` becomes ``lim[x]``.  The big operators become their glyph,
    followed by ``[lower to upper]`` or ``[lower]`` when bounds are attached.
    An upper bound with no lower bound is re-emitted as ``^{upper}`` so it is
    not lost.  ``\limits``/``\nolimits`` are removed first.  Only whole
    commands match, so ``\limsup``, ``\liminf`` and ``\intercal`` are left
    untouched.

    Args:
        s: The expression possibly containing bounded operators.

    Returns:
        str: The expression with limits and big operators converted.
    """
    # \limits / \nolimits only affect typeset placement; dropping them lets
    # the bounds that follow attach directly to the operator.
    s = re.sub(r"\\(?:no)?limits(?![A-Za-z])", "", s)
    s = re.sub(r"\\lim_\{([^{}]+)\}", lambda m: f"lim[{m.group(1)}]", s)
    s = re.sub(r"\\lim(?![A-Za-z])", "lim", s)

    for pattern, symbol in _BIG_OPERATORS:
        out: list[str] = []
        pos = 0
        while True:
            m = pattern.search(s, pos)
            if m is None:
                out.append(s[pos:])
                break
            out.append(s[pos : m.start()])
            lower, upper, pos = _extract_bounds(s, m.end())
            if lower and upper:
                out.append(f"{symbol}[{lower} to {upper}]")
            elif lower:
                out.append(f"{symbol}[{lower}]")
            elif upper:
                out.append(f"{symbol}^{{{upper}}}")
            else:
                out.append(symbol)
        s = "".join(out)
    return s


def _read_script_argument(s: str, pos: int) -> Tuple[str, int]:
    r"""Read one script argument at *pos*: ``{...}``, ``\command`` or one alnum.

    Returns the argument text and the index after it, or ``("", pos)`` when
    nothing recognizable starts at *pos*.
    """
    if pos >= len(s):
        return "", pos
    ch = s[pos]
    if ch == "{":
        return _extract_braced(s, pos)
    if ch == "\\":
        m = _COMMAND_RE.match(s, pos)
        return (m.group(0), m.end()) if m else ("", pos)
    if ch.isalnum():
        return ch, pos + 1
    return "", pos


def _extract_bounds(s: str, pos: int) -> Tuple[str, str, int]:
    """Read the ``_lower`` and ``^upper`` bounds attached to an operator.

    Each bound is optional and the two may appear in either order.

    Args:
        s: The string being scanned.
        pos: Index immediately after the operator token.

    Returns:
        Tuple[str, str, int]: The lower bound, the upper bound (either may be
        empty), and the index just past what was consumed.
    """
    lower = ""
    upper = ""
    seen = set()
    # At most one "_" and one "^".
    for _ in range(2):
        if pos >= len(s):
            break
        marker = s[pos]
        if marker not in ("_", "^") or marker in seen:
            break
        seen.add(marker)
        value, pos = _read_script_argument(s, pos + 1)
        if marker == "_":
            lower = value
        else:
            upper = value
    return lower, upper, pos


# ---------------------------------------------------------------------------
# Matrix environments
# ---------------------------------------------------------------------------


def _convert_matrix_environments(s: str) -> str:
    r"""Lay out LaTeX matrix environments as bracketed multi-line text.

    Matches ``pmatrix``/``bmatrix``/``matrix``/``vmatrix``/``Vmatrix``, splits
    the body on ``\\`` into rows and on ``&`` into cells, and emits one row
    per line.  The opening environment picks the delimiters: ``( )``,
    ``[ ]``, ``| |``, ``|| ||``, or none for plain ``matrix``.

    Args:
        s: The expression possibly containing matrix environments.

    Returns:
        str: The expression with matrices rendered as text grids.
    """
    matrix_envs = r"(pmatrix|bmatrix|matrix|vmatrix|Vmatrix)"
    pattern = (
        r"\\begin\{" + matrix_envs + r"\}\s*([\s\S]*?)\s*"
        r"\\end\{" + matrix_envs + r"\}"
    )
    delimiters = {
        "pmatrix": ("(", ")"),
        "bmatrix": ("[", "]"),
        "vmatrix": ("|", "|"),
        "Vmatrix": ("||", "||"),
    }

    def _repl(m: re.Match) -> str:
        """Format one matched environment as a delimited grid."""
        rows = re.split(r"\\\\", m.group(2))
        body = "\n".join(
            " ".join(cell.strip() for cell in row.strip().split("&"))
            for row in rows
        )
        pair = delimiters.get(m.group(1))
        if pair is None:
            return body
        return f"{pair[0]}\n{body}\n{pair[1]}"

    return re.sub(pattern, _repl, s)


# ---------------------------------------------------------------------------
# Final cleanup
# ---------------------------------------------------------------------------


def _final_cleanup(s: str) -> str:
    r"""Strip leftover LaTeX scaffolding and normalize whitespace.

    Removes remaining ``\begin{...}``/``\end{...}`` wrappers, drops the
    backslash from unconverted commands, deletes all braces, trims whitespace
    before newlines, and collapses runs of blank lines and spaces.

    Args:
        s: The mostly-converted expression.

    Returns:
        str: The cleaned expression, stripped at both ends.
    """
    s = re.sub(r"\\(begin|end)\{[^}]*\}", "", s)
    s = re.sub(r"\\([A-Za-z]+)", r"\1", s)
    s = re.sub(r"[{}]", "", s)
    s = re.sub(r"\s+\n", "\n", s)
    s = re.sub(r"\n{3,}", "\n\n", s)
    s = re.sub(r" {2,}", " ", s)
    return s.strip()