# Source code for latex_converter

"""LaTeX to Discord Formatting Converter.

Converts LaTeX/math formatting to Discord-friendly text with Unicode symbols.

Strategy:
- Preserve fenced and inline code as-is.
- Convert display math to plain text (no code fences).
- Convert inline math to inline code with lightweight symbol mapping.
- Replace common LaTeX commands with Unicode equivalents.
- Handle nested ``\\frac{}{}``, ``\\sqrt{}``, superscript/subscript to Unicode.
- Support matrix environments and special math fonts.
"""

import re
import logging
from typing import Tuple

logger = logging.getLogger(__name__)

# ---------------------------------------------------------------------------
# Unicode mapping tables (module-level for reuse)
# ---------------------------------------------------------------------------

# Maps a LaTeX command name (without its leading backslash) to the text that
# replaces it.  Entries are grouped by theme; several commands intentionally
# share one glyph (e.g. ``to``/``rightarrow``, ``emptyset``/``varnothing``).
LATEX_TO_UNICODE = {
    # Greek lowercase
    "alpha": "\u03b1",
    "beta": "\u03b2",
    "gamma": "\u03b3",
    "delta": "\u03b4",
    "epsilon": "\u03b5",
    "zeta": "\u03b6",
    "eta": "\u03b7",
    "theta": "\u03b8",
    "iota": "\u03b9",
    "kappa": "\u03ba",
    "lambda": "\u03bb",
    "mu": "\u03bc",
    "nu": "\u03bd",
    "xi": "\u03be",
    "pi": "\u03c0",
    "rho": "\u03c1",
    "sigma": "\u03c3",
    "tau": "\u03c4",
    "upsilon": "\u03c5",
    "phi": "\u03c6",
    "chi": "\u03c7",
    "psi": "\u03c8",
    "omega": "\u03c9",
    # Greek uppercase
    "Gamma": "\u0393",
    "Delta": "\u0394",
    "Theta": "\u0398",
    "Lambda": "\u039b",
    "Xi": "\u039e",
    "Pi": "\u03a0",
    "Sigma": "\u03a3",
    "Upsilon": "\u03a5",
    "Phi": "\u03a6",
    "Psi": "\u03a8",
    "Omega": "\u03a9",
    # Greek variants
    "varepsilon": "\u03b5",
    "vartheta": "\u03d1",
    "varpi": "\u03d6",
    "varrho": "\u03f1",
    "varsigma": "\u03c2",
    "varphi": "\u03c6",
    # Operators and misc
    "times": "\u00d7",
    "cdot": "\u00b7",
    "pm": "\u00b1",
    "mp": "\u2213",
    "leq": "\u2264",
    "geq": "\u2265",
    "neq": "\u2260",
    "approx": "\u2248",
    "sim": "\u223c",
    "infty": "\u221e",
    "infinity": "\u221e",
    "to": "\u2192",
    "rightarrow": "\u2192",
    "leftarrow": "\u2190",
    "Rightarrow": "\u21d2",
    "Leftarrow": "\u21d0",
    "Leftrightarrow": "\u21d4",
    "partial": "\u2202",
    "nabla": "\u2207",
    "sum": "\u2211",
    "prod": "\u220f",
    "int": "\u222b",
    "forall": "\u2200",
    "exists": "\u2203",
    "neg": "\u00ac",
    "lor": "\u2228",
    "land": "\u2227",
    "oplus": "\u2295",
    "otimes": "\u2297",
    # Set theory
    "in": "\u2208",
    "notin": "\u2209",
    "subset": "\u2282",
    "supset": "\u2283",
    "subseteq": "\u2286",
    "supseteq": "\u2287",
    "cup": "\u222a",
    "cap": "\u2229",
    "emptyset": "\u2205",
    "varnothing": "\u2205",
    "setminus": "\u2216",
    "complement": "\u2201",
    # Logic
    "implies": "\u21d2",
    "iff": "\u21d4",
    "equiv": "\u2261",
    "therefore": "\u2234",
    "because": "\u2235",
    # Relations
    "cong": "\u2245",
    "ncong": "\u2247",
    "propto": "\u221d",
    "prec": "\u227a",
    "succ": "\u227b",
    "preceq": "\u2aaf",
    "succeq": "\u2ab0",
    "parallel": "\u2225",
    "perp": "\u22a5",
    "asymp": "\u224d",
    # Dots
    "ldots": "...",
    "cdots": "\u22ef",
    "vdots": "\u22ee",
    "ddots": "\u22f1",
    "dots": "...",
    # Brackets and delimiters
    "langle": "\u27e8",
    "rangle": "\u27e9",
    "lfloor": "\u230a",
    "rfloor": "\u230b",
    "lceil": "\u2308",
    "rceil": "\u2309",
    "lbrace": "{",
    "rbrace": "}",
    # Named functions and operators that are rendered as their own name
    # (trigonometric, logarithmic/exponential, algebraic, limits/bounds).
    **{
        name: name
        for name in (
            "sin", "cos", "tan", "cot", "sec", "csc",
            "arcsin", "arccos", "arctan",
            "sinh", "cosh", "tanh", "coth", "sech", "csch",
            "ln", "log", "exp", "lg",
            "det", "dim", "ker", "deg", "gcd", "lcm", "arg", "hom",
            "Pr", "mod",
            "lim", "sup", "inf", "max", "min",
        )
    },
    "limsup": "lim sup",
    "liminf": "lim inf",
    # Other symbols
    "ell": "\u2113",
    "hbar": "\u210f",
    "Re": "\u211c",
    "Im": "\u2111",
    "wp": "\u2118",
    "angle": "\u2220",
    "triangle": "\u25b3",
    "square": "\u25a1",
    "diamond": "\u25c7",
    "star": "\u22c6",
    "dagger": "\u2020",
    "ddagger": "\u2021",
}

# Double-struck (blackboard bold) capitals used for ``\mathbb{X}``.
# C, H, N, P, Q, R and Z are encoded in the Letterlike Symbols block; the
# remaining letters come from Mathematical Alphanumeric Symbols.  All 26
# capitals are listed so that no ``\mathbb`` letter needs a placeholder.
MATHBB_MAP = {
    "A": "\U0001d538", "B": "\U0001d539", "C": "\u2102", "D": "\U0001d53b",
    "E": "\U0001d53c", "F": "\U0001d53d", "G": "\U0001d53e", "H": "\u210d",
    "I": "\U0001d540", "J": "\U0001d541", "K": "\U0001d542", "L": "\U0001d543",
    "M": "\U0001d544", "N": "\u2115", "O": "\U0001d546", "P": "\u2119",
    "Q": "\u211a", "R": "\u211d", "S": "\U0001d54a", "T": "\U0001d54b",
    "U": "\U0001d54c", "V": "\U0001d54d", "W": "\U0001d54e", "X": "\U0001d54f",
    "Y": "\U0001d550", "Z": "\u2124",
}

# Script (calligraphic) capitals used for ``\mathcal{X}``.
# B, E, F, H, I, L, M and R have no code point in the Mathematical Script
# range; their script forms are encoded in the Letterlike Symbols block
# (U+212C, U+2130, U+2131, U+210B, U+2110, U+2112, U+2133, U+211B).
MATHCAL_MAP = {
    "A": "\U0001d49c", "B": "\u212c", "C": "\U0001d49e", "D": "\U0001d49f",
    "E": "\u2130", "F": "\u2131", "G": "\U0001d4a2", "H": "\u210b",
    "I": "\u2110", "J": "\U0001d4a5", "K": "\U0001d4a6", "L": "\u2112",
    "M": "\u2133", "N": "\U0001d4a9", "O": "\U0001d4aa", "P": "\U0001d4ab",
    "Q": "\U0001d4ac", "R": "\u211b", "S": "\U0001d4ae", "T": "\U0001d4af",
    "U": "\U0001d4b0", "V": "\U0001d4b1", "W": "\U0001d4b2", "X": "\U0001d4b3",
    "Y": "\U0001d4b4", "Z": "\U0001d4b5",
}

# Characters that have a Unicode superscript form, paired positionally with
# that form: digits 0-9, then "+", "-", "(", ")", "=", "n", "i".
SUPERSCRIPT_MAP = dict(zip(
    "0123456789" "+-()=" "ni",
    "\u2070\u00b9\u00b2\u00b3\u2074\u2075\u2076\u2077\u2078\u2079"
    "\u207a\u207b\u207d\u207e\u207c"
    "\u207f\u2071",
))

# Characters that have a Unicode subscript form, paired positionally with
# that form: digits 0-9, then "+", "-", "(", ")", "=".
SUBSCRIPT_MAP = dict(zip(
    "0123456789" "+-()=",
    "\u2080\u2081\u2082\u2083\u2084\u2085\u2086\u2087\u2088\u2089"
    "\u208a\u208b\u208d\u208e\u208c",
))


# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------

def convert_latex_to_discord(text: str) -> str:
    r"""Convert common LaTeX/math formatting to Discord-friendly text.

    Preserves fenced code blocks and inline code spans untouched.
    Everything else is scanned for LaTeX delimiters and converted
    using Unicode symbols.

    Args:
        text: Message text that may contain LaTeX.

    Returns:
        The converted text.  The input is returned unchanged when it
        contains none of ``\[``, ``\(``, ``\begin{`` or ``$``, and also
        when the conversion raises (a warning is logged in that case).
        Falsy input yields ``""``; a truthy non-string is passed through
        as-is.
    """
    if not isinstance(text, str) or not text:
        return text or ""

    # Cheap pre-check: without a math delimiter there is nothing to convert,
    # so the full pipeline is skipped.
    if not re.search(r"(\\\[|\\\(|\\begin\{|\$\$|\$)", text):
        return text

    try:
        return _do_convert(text)
    except Exception as e:
        # Best-effort by design: a conversion failure must never lose the
        # message, so fall back to the untouched input.
        logger.warning("LaTeX conversion failed: %s", e)
        return text
# ---------------------------------------------------------------------------
# Internal implementation
# ---------------------------------------------------------------------------

# Private-use code points that stand in for literal braces while the math
# pipeline runs.  Grouping braces are stripped by ``_final_cleanup``; without
# the placeholders, escaped braces (set notation) would be stripped as well.
_LITERAL_LBRACE = "\ue000"
_LITERAL_RBRACE = "\ue001"

# Matches either a ``\\`` line break (group 1, left untouched) or an escaped
# brace / ``\lbrace`` / ``\rbrace`` (group 2).  Consuming ``\\`` first keeps
# ``\\{`` from being misread as ``\`` followed by ``\{``.
_ESCAPED_BRACE_RE = re.compile(
    r"(\\\\)|\\(\{|\}|lbrace(?![A-Za-z])|rbrace(?![A-Za-z]))"
)
_SQRT_OPEN_RE = re.compile(r"\\sqrt\{")
_FRAC_OPEN_RE = re.compile(r"\\frac\{")


def _do_convert(text: str) -> str:
    """Top-level driver: split on fenced code blocks, process segments.

    Fenced blocks (triple backticks) are copied verbatim; the text between
    them is passed to :func:`_transform_non_code`.
    """
    out_parts: list[str] = []
    fence_pattern = re.compile(r"```[\s\S]*?```", re.MULTILINE)
    last_end = 0
    for m in fence_pattern.finditer(text):
        before = text[last_end:m.start()]
        if before:
            out_parts.append(_transform_non_code(before))
        out_parts.append(m.group(0))
        last_end = m.end()
    tail = text[last_end:]
    if tail:
        out_parts.append(_transform_non_code(tail))
    return "".join(out_parts)


def _transform_non_code(segment: str) -> str:
    """Transform a segment that is NOT inside a fenced code block.

    Inline code spans (single backtick) are preserved as-is; everything
    around them gets text-style conversion followed by math conversion.
    """
    pieces: list[str] = []
    idx = 0
    for im in re.finditer(r"`[^`]*`", segment):
        part = segment[idx:im.start()]
        part = _transform_text_styles(part)
        part = _transform_math_segments(part)
        pieces.append(part)
        pieces.append(im.group(0))
        idx = im.end()
    part = segment[idx:]
    part = _transform_text_styles(part)
    part = _transform_math_segments(part)
    pieces.append(part)
    return "".join(pieces)


# ---------------------------------------------------------------------------
# Math segment transforms
# ---------------------------------------------------------------------------

def _transform_math_segments(s: str) -> str:
    r"""Rewrite every math span found in *s*.

    Display forms (``\[...\]``, the equation/align/gather/multline
    environments and ``$$...$$``) are replaced by their converted text.
    Inline forms (``\(...\)`` and ``$...$``) are wrapped in backticks;
    ``$...$`` candidates are vetted by :func:`_repl_inline_dollar` first.
    ``$$`` is handled before ``$`` so display math is not split in two.
    """
    # Display math: \[...\]
    s = re.sub(
        r"\\\[\s*([\s\S]*?)\s*\\\]",
        lambda m: _process_math(m.group(1).strip()),
        s,
    )
    # equation, align, gather, multline environments
    s = re.sub(
        r"\\begin\{(equation\*?|align\*?|gather\*?|multline\*?)\}"
        r"\s*([\s\S]*?)\s*\\end\{\1\}",
        lambda m: _process_math(m.group(2).strip()),
        s,
    )
    # $$...$$
    s = re.sub(
        r"\$\$\s*([\s\S]*?)\s*\$\$",
        lambda m: _process_math(m.group(1).strip()),
        s,
    )
    # Inline \(...\)
    s = re.sub(
        r"\\\(\s*([\s\S]*?)\s*\\\)",
        lambda m: f"`{_process_math(m.group(1))}`",
        s,
    )
    # Inline $...$
    s = re.sub(r"\$(.+?)\$", _repl_inline_dollar, s)
    return s


def _repl_inline_dollar(m: re.Match) -> str:
    """Decide whether ``$...$`` is inline math or a currency/prose literal.

    Returns the match unchanged when the content looks like a price or
    ordinary prose, and the converted math wrapped in backticks otherwise.
    """
    inner = m.group(1)
    stripped = inner.strip()
    # A bare number between dollar signs is a price, not math.
    if re.match(r"^\d+(\.\d+)?$", stripped):
        return m.group(0)
    # Currency pattern: starts with a digit, contains spaces, no LaTeX cmds
    if re.match(r"^\d", stripped) and " " in inner and "\\" not in inner:
        return m.group(0)
    # Natural-language content with spaces and no LaTeX commands
    if " " in inner and "\\" not in inner and not re.search(r"[=^_<>{}]", inner):
        return m.group(0)
    skip_chars = ["\\", "^", "_", "="]
    if " or " in inner.lower() and not any(c in inner for c in skip_chars):
        return m.group(0)
    math_ops = ["^", "_", "=", "+", "-", "*", "/", "<", ">"]
    has_math = "\\" in inner or any(op in inner for op in math_ops)
    if has_math:
        return f"`{_process_math(inner)}`"
    return m.group(0)


def _transform_text_styles(s: str) -> str:
    r"""Map LaTeX text-style commands to Discord markdown.

    ``\textbf``/``\mathbf`` become bold, ``\textit``/``\mathit``/``\emph``
    italic, ``\underline`` underline and ``\texttt``/``\mathtt`` inline
    code.  Only arguments without nested braces are matched.
    """
    s = re.sub(r"\\(?:textbf|mathbf)\{([^{}]+)\}", r"**\1**", s)
    s = re.sub(r"\\(?:textit|mathit|emph)\{([^{}]+)\}", r"*\1*", s)
    s = re.sub(r"\\underline\{([^{}]+)\}", r"__\1__", s)
    s = re.sub(r"\\(?:texttt|mathtt)\{([^{}]+)\}", r"`\1`", s)
    return s


# ---------------------------------------------------------------------------
# Core math processing pipeline
# ---------------------------------------------------------------------------

def _protect_literal_braces(s: str) -> str:
    r"""Swap ``\{``, ``\}``, ``\lbrace``, ``\rbrace`` for placeholders."""
    def _repl(m: re.Match) -> str:
        if m.group(1):
            # A ``\\`` line break: keep it exactly as written.
            return m.group(1)
        if m.group(2) in ("{", "lbrace"):
            return _LITERAL_LBRACE
        return _LITERAL_RBRACE

    return _ESCAPED_BRACE_RE.sub(_repl, s)


def _process_math(content: str) -> str:
    """Run all conversion stages in the correct order.

    Literal braces are hidden behind placeholders for the duration of the
    pipeline and restored at the end, so that the brace stripping done by
    ``_final_cleanup`` removes grouping braces only.
    """
    content = _protect_literal_braces(content)
    content = _convert_matrix_environments(content)
    content = _convert_limits_and_bounds(content)
    content = _convert_nested_structures(content)
    content = _apply_symbol_map(content)
    content = _convert_sup_sub(content)
    content = _final_cleanup(content)
    return (
        content
        .replace(_LITERAL_LBRACE, "{")
        .replace(_LITERAL_RBRACE, "}")
    )


# ---------------------------------------------------------------------------
# Brace extraction
# ---------------------------------------------------------------------------

def _extract_braced(s: str, start: int) -> Tuple[str, int]:
    """Extract balanced ``{...}``. Returns ``('', start)`` on failure.

    On success the result is the text between the outer braces and the
    index just past the closing brace.  A backslash skips the character
    that follows it, so escaped braces do not affect the nesting depth.
    """
    if start >= len(s) or s[start] != "{":
        return "", start
    depth = 0
    pos = start
    while pos < len(s):
        if s[pos] == "{":
            depth += 1
        elif s[pos] == "}":
            depth -= 1
            if depth == 0:
                return s[start + 1:pos], pos + 1
        elif s[pos] == "\\" and pos + 1 < len(s):
            pos += 1
        pos += 1
    return "", start


# ---------------------------------------------------------------------------
# Symbol map + accents
# ---------------------------------------------------------------------------

def _apply_symbol_map(s: str) -> str:
    r"""Replace math fonts, accents, wrappers and commands with plain text.

    Handles ``\mathbb``/``\mathcal``/``\mathfrak``, the accents ``\hat``,
    ``\bar``, ``\tilde``, ``\vec``, ``\dot``, ``\ddot`` (as combining
    marks), ``\binom``, ``\pmod``, text wrappers, ``\\`` line breaks and
    ``\left``/``\right``; finally every remaining ``\command`` is looked up
    in ``LATEX_TO_UNICODE`` and left untouched when it is not listed.
    """
    def _repl_mathbb(m: re.Match) -> str:
        """Look up a blackboard-bold letter, with a bracketed fallback."""
        return MATHBB_MAP.get(
            m.group(1), f"\U0001d539\U0001d539[{m.group(1)}]")

    def _repl_mathcal(m: re.Match) -> str:
        """Look up a calligraphic letter, with a bracketed fallback."""
        return MATHCAL_MAP.get(
            m.group(1), f"\U0001d4d2\U0001d4d0\U0001d4db[{m.group(1)}]")

    s = re.sub(r"\\mathbb\{([A-Z])\}", _repl_mathbb, s)
    s = re.sub(r"\\mathcal\{([A-Z])\}", _repl_mathcal, s)
    s = re.sub(
        r"\\mathfrak\{([^{}]+)\}",
        lambda m: f"\U0001d509\U0001d52f\U0001d51e\U0001d528[{m.group(1)}]",
        s)

    # Accents: append the combining mark to the argument.
    s = re.sub(r"\\hat\{([^{}]+)\}", "\\1\u0302", s)
    s = re.sub(r"\\bar\{([^{}]+)\}", "\\1\u0304", s)
    s = re.sub(r"\\tilde\{([^{}]+)\}", "\\1\u0303", s)
    s = re.sub(r"\\vec\{([^{}]+)\}", "\\1\u20d7", s)

    def _repl_dot(m: re.Match) -> str:
        """Combining dot above for a single letter, ``x-dot`` otherwise."""
        c = m.group(1)
        return (c + "\u0307") if len(c) == 1 and c.isalpha() else f"{c}-dot"

    def _repl_ddot(m: re.Match) -> str:
        """Combining diaeresis for a single letter, ``x-ddot`` otherwise."""
        c = m.group(1)
        return (c + "\u0308") if len(c) == 1 and c.isalpha() else f"{c}-ddot"

    s = re.sub(r"\\dot\{([^{}]+)\}", _repl_dot, s)
    s = re.sub(r"\\ddot\{([^{}]+)\}", _repl_ddot, s)

    # Binomial / modulo
    s = re.sub(
        r"\\binom\{([^{}]+)\}\{([^{}]+)\}",
        lambda m: f"C({m.group(1)},{m.group(2)})",
        s,
    )
    s = re.sub(r"\\pmod\{([^{}]+)\}", r"(mod \1)", s)

    # Literal braces
    s = s.replace(r"\{", "{").replace(r"\}", "}")

    # Text wrappers
    s = re.sub(r"\\text\(([^()]+)\)", r"\1", s)
    s = re.sub(r"\\mathbf\{([^{}]+)\}", r"**\1**", s)
    s = re.sub(
        r"\\(?:text|mathrm|textrm|mathit|mathsf|mathtt)\{([^{}]+)\}",
        r"\1",
        s,
    )

    s = s.replace("\\\\", "\n")
    # The lookahead keeps \left / \right from eating the start of longer
    # commands such as \leftarrow and \rightarrow.
    s = re.sub(r"\\left(?![A-Za-z])\s*", "", s)
    s = re.sub(r"\\right(?![A-Za-z])\s*", "", s)

    def _repl_cmd(m: re.Match) -> str:
        """Map a command name to its replacement, or keep it verbatim."""
        return LATEX_TO_UNICODE.get(m.group(1), m.group(0))

    s = re.sub(r"\\([A-Za-z]+)", _repl_cmd, s)
    return s


# ---------------------------------------------------------------------------
# Superscript / subscript
# ---------------------------------------------------------------------------

def _convert_sup_sub(s: str) -> str:
    """Convert ``^``/``_`` scripts to Unicode where every character maps.

    A braced script is converted only if all of its characters have a
    Unicode form; otherwise it is left as ``^{...}`` / ``_{...}``.
    Unbraced scripts cover a single ``[0-9a-z]`` (superscript) or a single
    digit (subscript).
    """
    def _repl_super_block(m: re.Match) -> str:
        """Convert ``^{...}`` when every character has a superscript."""
        content = m.group(1)
        if all(ch in SUPERSCRIPT_MAP for ch in content):
            return "".join(SUPERSCRIPT_MAP[ch] for ch in content)
        return f"^{{{content}}}"

    s = re.sub(r"\^\{([^{}]+)\}", _repl_super_block, s)

    def _repl_super_single(m: re.Match) -> str:
        """Convert ``^x`` for a single character, if it has a form."""
        return SUPERSCRIPT_MAP.get(m.group(1), f"^{m.group(1)}")

    s = re.sub(r"\^([0-9a-z])", _repl_super_single, s)

    def _repl_sub_block(m: re.Match) -> str:
        """Convert ``_{...}`` when every character has a subscript."""
        content = m.group(1)
        if all(ch in SUBSCRIPT_MAP for ch in content):
            return "".join(SUBSCRIPT_MAP[ch] for ch in content)
        return f"_{{{content}}}"

    s = re.sub(r"_\{([^{}]+)\}", _repl_sub_block, s)

    def _repl_sub_single(m: re.Match) -> str:
        """Convert ``_d`` for a single digit."""
        return SUBSCRIPT_MAP.get(m.group(1), f"_{m.group(1)}")

    s = re.sub(r"_([0-9])", _repl_sub_single, s)
    return s


# ---------------------------------------------------------------------------
# Nested structures (\frac, \sqrt)
# ---------------------------------------------------------------------------

def _is_simple_expression(expr: str) -> bool:
    """Return True when *expr* can be written without extra parentheses.

    Empty and already-parenthesised expressions are simple.  Expressions
    containing a division/multiplication sign, a big operator, or both a
    ``+``/``-`` and a script/root are not.  Anything else is simple only
    if it has no whitespace and none of ``+ - * /``.
    """
    if not expr:
        return True
    expr = expr.strip()
    if expr.startswith("(") and expr.endswith(")"):
        return True
    if "/" in expr or "\u00d7" in expr or "\u00b7" in expr:
        return False
    if "\u2211" in expr or "\u222b" in expr or "\u220f" in expr:
        return False
    ops = ("+" in expr or "-" in expr)
    subs = ("^" in expr or "_" in expr or "\u221a" in expr)
    if ops and subs:
        return False
    if re.match(r"^[^\s+\-*/]+$", expr):
        return True
    return False


def _rewrite_sqrt(s: str) -> Tuple[str, bool]:
    r"""Rewrite each ``\sqrt{...}`` once; report whether anything changed."""
    out: list[str] = []
    changed = False
    pos = 0
    while True:
        match = _SQRT_OPEN_RE.search(s, pos)
        if not match:
            out.append(s[pos:])
            break
        out.append(s[pos:match.start()])
        content, end_pos = _extract_braced(s, match.end() - 1)
        if content:
            part = content if _is_simple_expression(content) else f"({content})"
            out.append(f"\u221a{part}")
            pos = end_pos
            changed = True
        else:
            # Unbalanced or empty argument: emit only the matched command
            # text (the preceding text was already appended above).
            out.append(match.group(0))
            pos = match.end()
    return "".join(out), changed


def _rewrite_frac(s: str) -> Tuple[str, bool]:
    r"""Rewrite each ``\frac{..}{..}`` once; report whether anything changed."""
    out: list[str] = []
    changed = False
    pos = 0
    while True:
        match = _FRAC_OPEN_RE.search(s, pos)
        if not match:
            out.append(s[pos:])
            break
        out.append(s[pos:match.start()])
        numerator, next_pos = _extract_braced(s, match.end() - 1)
        denominator = ""
        end_pos = next_pos
        if numerator:
            denominator, end_pos = _extract_braced(s, next_pos)
        if not numerator or not denominator:
            # Malformed fraction: keep the command text and move on without
            # repeating the text that precedes it.
            out.append(match.group(0))
            pos = match.end()
            continue
        num_str = (numerator if _is_simple_expression(numerator)
                   else f"({numerator})")
        den_str = (denominator if _is_simple_expression(denominator)
                   else f"({denominator})")
        out.append(f"{num_str}/{den_str}")
        pos = end_pos
        changed = True
    return "".join(out), changed


def _convert_nested_structures(s: str) -> str:
    r"""Flatten ``\sqrt{...}`` and ``\frac{...}{...}``, including nesting.

    Each pass rewrites the outermost occurrences; passes repeat (at most
    10 times) until nothing changes, which unwinds nested constructs.
    Malformed occurrences are left in place unchanged.
    """
    max_iterations = 10
    for _ in range(max_iterations):
        s, sqrt_changed = _rewrite_sqrt(s)
        s, frac_changed = _rewrite_frac(s)
        if not (sqrt_changed or frac_changed):
            break
    return s


# ---------------------------------------------------------------------------
# Limits, sums, integrals, products
# ---------------------------------------------------------------------------

def _convert_limits_and_bounds(s: str) -> str:
    r"""Render ``\lim``, ``\sum``, ``\int`` and ``\prod`` with their bounds.

    ``\lim_{x}`` becomes ``lim[x]``.  The big operators become their symbol
    followed by ``[lower to upper]`` or ``[lower]``.  When only an upper
    bound is present its original ``^...`` text is kept after the symbol
    so later stages can still render it.  Longer commands that merely
    start with one of these names (e.g. ``\limsup``) are not touched.
    """
    s = re.sub(r"\\lim_\{([^{}]+)\}", lambda m: f"lim[{m.group(1)}]", s)
    s = re.sub(r"\\lim(?![A-Za-z])", "lim", s)

    cmds = [("\\sum", "\u2211"), ("\\int", "\u222b"), ("\\prod", "\u220f")]
    for cmd, symbol in cmds:
        # Hoisted out of the scan loop; the lookahead rejects prefix matches.
        cmd_re = re.compile(re.escape(cmd) + r"(?![A-Za-z])")
        pos = 0
        result: list[str] = []
        while True:
            match = cmd_re.search(s, pos)
            if not match:
                result.append(s[pos:])
                break
            result.append(s[pos:match.start()])
            bound_start = match.end()
            lower, upper, end_pos = _extract_bounds(s, bound_start)
            if lower and upper:
                result.append(f"{symbol}[{lower} to {upper}]")
            elif lower:
                result.append(f"{symbol}[{lower}]")
            elif upper:
                # No lower bound: keep the script text instead of losing it.
                result.append(symbol + s[bound_start:end_pos])
            else:
                result.append(symbol)
            pos = end_pos
        s = "".join(result)
    return s


def _read_bound(s: str, pos: int) -> Tuple[str, int]:
    r"""Read one bound at *pos*: ``{...}``, a ``\command`` or one alnum char.

    Returns ``('', pos)`` when nothing usable starts at *pos*.
    """
    if pos >= len(s):
        return "", pos
    ch = s[pos]
    if ch == "{":
        return _extract_braced(s, pos)
    if ch == "\\":
        m = re.match(r"\\([A-Za-z]+)", s[pos:])
        if m:
            return m.group(0), pos + len(m.group(0))
        return "", pos
    if ch.isalnum():
        return ch, pos + 1
    return "", pos


def _extract_bounds(s: str, pos: int) -> Tuple[str, str, int]:
    """Extract ``_subscript`` and ``^superscript`` starting at *pos*.

    The two scripts may appear in either order; each is read at most once.
    Returns ``(lower, upper, next_pos)`` with empty strings for missing
    bounds.
    """
    lower = ""
    upper = ""
    seen: set[str] = set()
    while pos < len(s) and s[pos] in ("_", "^") and s[pos] not in seen:
        marker = s[pos]
        seen.add(marker)
        value, pos = _read_bound(s, pos + 1)
        if marker == "_":
            lower = value
        else:
            upper = value
    return lower, upper, pos


# ---------------------------------------------------------------------------
# Matrix environments
# ---------------------------------------------------------------------------

def _convert_matrix_environments(s: str) -> str:
    r"""Render matrix environments as rows of space-separated cells.

    Rows are split on ``\\`` and cells on ``&``.  ``pmatrix``, ``bmatrix``,
    ``vmatrix`` and ``Vmatrix`` are wrapped in ``( )``, ``[ ]``, ``| |``
    and ``|| ||`` on their own lines; plain ``matrix`` has no delimiters.
    """
    matrix_envs = r"(pmatrix|bmatrix|matrix|vmatrix|Vmatrix)"
    beg = r"\\begin\{" + matrix_envs + r"\}\s*([\s\S]*?)\s*"
    end = r"\\end\{" + matrix_envs + r"\}"
    pattern = beg + end

    def _repl(m: re.Match) -> str:
        """Format one matched environment."""
        env = m.group(1)
        rows = re.split(r"\\\\", m.group(2))
        fmt = []
        for row in rows:
            cells = re.split(r"&", row.strip())
            fmt.append(" ".join(c.strip() for c in cells))
        body = "\n".join(fmt)
        if env == "pmatrix":
            return f"(\n{body}\n)"
        if env == "bmatrix":
            return f"[\n{body}\n]"
        if env == "vmatrix":
            return f"|\n{body}\n|"
        if env == "Vmatrix":
            return f"||\n{body}\n||"
        return body

    return re.sub(pattern, _repl, s)


# ---------------------------------------------------------------------------
# Final cleanup
# ---------------------------------------------------------------------------

def _final_cleanup(s: str) -> str:
    r"""Strip leftover LaTeX syntax and normalise whitespace.

    Removes remaining ``\begin{..}``/``\end{..}`` markers, drops the
    backslash from unknown commands, deletes all braces, trims whitespace
    before newlines, collapses 3+ newlines to two and runs of spaces to
    one, and strips the result.
    """
    s = re.sub(r"\\(begin|end)\{[^}]*\}", "", s)
    s = re.sub(r"\\([A-Za-z]+)", r"\1", s)
    s = re.sub(r"[{}]", "", s)
    s = re.sub(r"\s+\n", "\n", s)
    s = re.sub(r"\n{3,}", "\n\n", s)
    s = re.sub(r" {2,}", " ", s)
    return s.strip()