Source code for message_utils

"""Message utility functions for Discord bot.

Includes markdown-aware message splitting that preserves formatting across
chunk boundaries, plus helper functions for mention filtering and XML escaping.
"""

import re
from typing import List



[docs]
def escape_xml(text: str) -> str:
    """Return *text* with the five XML special characters turned into entities.

    ``&``, ``<``, ``>``, ``"`` and ``'`` are replaced by ``&amp;``,
    ``&lt;``, ``&gt;``, ``&quot;`` and ``&apos;`` respectively.
    """
    # A translation table rewrites every character in a single pass, so the
    # ampersands introduced by the entities are never escaped a second time.
    entity_table = str.maketrans({
        "&": "&amp;",
        "<": "&lt;",
        ">": "&gt;",
        '"': "&quot;",
        "'": "&apos;",
    })
    return text.translate(entity_table)




[docs]
def filter_backticks_from_mentions(text: str) -> str:
    """Strip the backticks that wrap a Discord user mention.

    A mention of the form ``<@ID>`` or ``<@!ID>`` that has at least one
    backtick directly on each side is replaced by the bare mention; any
    number of backticks per side (single, triple, ...) is accepted.

    Anything that is not a ``str`` is handed back unchanged.
    """
    if isinstance(text, str):
        wrapped_mention = re.compile(r'`+(<@!?\d+>)`+')
        return wrapped_mention.sub(r'\1', text)
    return text




[docs]
def split_message(
    text: str,
    max_length: int = 1950,
    overflow_allowed: int = 45,
) -> List[str]:
    """Split a string into chunks respecting markdown formatting.

    The text is first cut at manual split points and every resulting part
    is then handed to ``_split_with_markdown_awareness``.  Manual split
    points are:

    * the literal marker ``{{ SPLIT_HERE }}``;
    * a ``---\\n`` rule;
    * a backtick, one or two newlines, and another backtick, i.e. the
      boundary between two adjacent code spans / code blocks.  Both
      backticks are kept so each side stays correctly delimited.

    Triple single quotes (``'''``) inside a part are rewritten to triple
    backticks before the part is split.

    Args:
        text: Message content to split.
        max_length: Target chunk length passed to the markdown-aware
            splitter.
        overflow_allowed: Extra characters allowed on top of *max_length*;
            the sum is passed to the splitter as its hard limit.

    Returns:
        The non-empty chunks in order.  ``["Error: Invalid message content"]``
        when *text* is not a string, ``[""]`` when it is the empty string,
        and an empty list when it holds nothing but whitespace and split
        markers.
    """
    if not isinstance(text, str):
        return ["Error: Invalid message content"]
    if not text:
        return [""]

    split_marker = "{{ SPLIT_HERE }}"
    hard_limit = max_length + overflow_allowed

    text = text.replace("---\n", split_marker)

    # Split between adjacent code spans/blocks, but put the closing backtick
    # of the left side and the opening backtick of the right side back.
    # Replacing the whole "`\n`" match with the bare marker would delete
    # both delimiters and leave unbalanced backticks in each part.
    code_boundary = "`" + split_marker + "`"
    text = text.replace('`\n\n`', code_boundary)
    text = text.replace('`\n`', code_boundary)

    final_chunks: List[str] = []
    for text_part in text.split(split_marker):
        text_part = text_part.strip()
        if not text_part:
            continue

        processed_text = text_part.replace("'''", "```")
        chunks = _split_with_markdown_awareness(
            processed_text, max_length, hard_limit
        )
        final_chunks.extend(c for c in chunks if c)

    return final_chunks



# ------------------------------------------------------------------
# Internal helpers
# ------------------------------------------------------------------

def _split_with_markdown_awareness(
    text: str, target_length: int, hard_limit: int,
) -> List[str]:
    """Split *text* while preserving markdown formatting."""
    chunks: List[str] = []
    current_chunk = ""
    in_code_block = False
    code_block_lang = ""
    active_formats: List[str] = []

    lines = text.split('\n')
    i = 0

    while i < len(lines):
        line = lines[i]

        if "```" in line:
            parts = line.split("```")
            for j, part in enumerate(parts):
                if j > 0:
                    if in_code_block:
                        current_chunk = current_chunk.rstrip('\n') + "\n```"
                        in_code_block = False
                        code_block_lang = ""
                        if len(current_chunk) > target_length:
                            chunks.append(current_chunk.strip())
                            current_chunk = ""
                    else:
                        in_code_block = True
                        stripped = part.strip()
                        code_block_lang = (
                            stripped.split()[0] if stripped else ""
                        )
                        if len(current_chunk) > 0:
                            current_chunk = _close_formats(
                                current_chunk, active_formats
                            )
                            chunks.append(current_chunk.strip())
                            current_chunk = ""
                        current_chunk = f"```{code_block_lang}\n"
                        active_formats.clear()
                        continue

                if part or j == 0:
                    if in_code_block:
                        current_chunk += part
                    else:
                        if part.strip():
                            current_chunk, active_formats = (
                                _add_line_with_format_tracking(
                                    current_chunk, part, active_formats,
                                    target_length, hard_limit, chunks,
                                )
                            )
            i += 1
            continue

        if in_code_block:
            potential_chunk = current_chunk + line + "\n"
            if len(potential_chunk) > target_length:
                current_chunk = current_chunk.rstrip() + "\n```"
                chunks.append(current_chunk)
                current_chunk = f"```{code_block_lang}\n{line}\n"
            else:
                current_chunk = potential_chunk
        else:
            current_chunk, active_formats = _add_line_with_format_tracking(
                current_chunk, line, active_formats,
                target_length, hard_limit, chunks,
            )

        i += 1

    if current_chunk.strip():
        if in_code_block:
            current_chunk += "\n```"
        else:
            current_chunk = _close_formats(current_chunk, active_formats)
        chunks.append(current_chunk.strip())

    return chunks


def _add_line_with_format_tracking(
    current_chunk: str,
    line: str,
    active_formats: List[str],
    target_length: int,
    hard_limit: int,
    chunks: List[str],
) -> tuple:
    """Add a line to *current_chunk* while tracking markdown formats.

    The line is processed word by word (split on single spaces).  When
    adding a word would push the chunk past *target_length*, the open
    formats are closed, the chunk is appended to *chunks*, and a new chunk
    is started with the same formats re-opened.

    Args:
        current_chunk: Text accumulated so far for the chunk being built.
        line: The line to add (no trailing newline).
        active_formats: Format markers open at the start of the line.
        target_length: Length after which the chunk is flushed.
        hard_limit: Absolute cap; a flushed chunk is truncated to this
            length, and a single word at least this long is emitted on its
            own in slices of this size.
        chunks: Output list; completed chunks are appended in place.

    Returns:
        ``(current_chunk, active_formats)``: the chunk under construction,
        terminated with a newline, and the formats open after the line.
    """
    words = line.split(' ')

    # A leading bullet ("*", "-", "+") or ordinal ("1.") followed by more
    # text is list syntax; it must not be read as an emphasis marker.
    is_list_item = len(words) > 1 and (
        words[0] in ('*', '-', '+')
        or re.match(r'^\d+\.$', words[0]) is not None
    )

    # True while the chunk holds only freshly re-opened format markers.
    # The next word must then be attached directly: a space after the
    # opening marker ("* word") stops the emphasis from rendering and adds
    # a stray leading space inside inline code.
    at_reopened_start = False

    for idx, word in enumerate(words):
        if len(word) >= hard_limit:
            # The word cannot fit in any chunk: flush what we have and emit
            # the word by itself in hard_limit-sized slices.
            if current_chunk:
                chunks.append(
                    _close_formats(current_chunk, active_formats).strip()
                )
            chunks.extend(
                word[start:start + hard_limit]
                for start in range(0, len(word), hard_limit)
            )
            current_chunk = _open_formats("", active_formats)
            at_reopened_start = True
            continue

        # Reserve room for closing the open formats here and re-opening
        # them in the next chunk.
        format_overhead = sum(len(tag) for tag in active_formats) * 2
        needs_space = (
            bool(current_chunk)
            and not current_chunk.endswith('\n')
            and not at_reopened_start
        )
        potential_length = (
            len(current_chunk) + int(needs_space) + len(word)
            + format_overhead
        )

        if potential_length > target_length and current_chunk:
            current_chunk = _close_formats(current_chunk, active_formats)
            if len(current_chunk) > hard_limit:
                # Safety net: never emit a chunk longer than hard_limit.
                current_chunk = current_chunk[:hard_limit]
            chunks.append(current_chunk.strip())
            current_chunk = _open_formats("", active_formats)
            # The word now starts a chunk, directly after any markers.
            needs_space = False

        if needs_space:
            current_chunk += " "
        current_chunk += word
        if word:
            at_reopened_start = False

        if not (is_list_item and idx == 0):
            active_formats = _update_formats_from_word(word, active_formats)

    current_chunk += "\n"
    return current_chunk, active_formats


def _update_formats_from_word(
    word: str, active_formats: List[str]
) -> List[str]:
    """Update active markdown formats based on markers found in *word*.

    Handles ``**``, ``__``, ``*``, ``_``, ``~~``, ``||``,
    and inline code
    (including multi-backtick sequences).  Single ``*`` and ``_`` are only
    counted when they appear at word boundaries to avoid false positives on
    identifiers like ``some_variable``.
    """
    formats = active_formats.copy()

    active_inline_code = None
    for fmt in formats:
        if fmt.startswith('INLINE_CODE_'):
            active_inline_code = fmt
            break

    i = 0
    while i < len(word):
        if word[i] == '`':
            backtick_count = 0
            j = i
            while j < len(word) and word[j] == '`':
                backtick_count += 1
                j += 1
            code_marker = f'INLINE_CODE_{backtick_count}'
            if active_inline_code:
                if code_marker == active_inline_code:
                    formats.remove(active_inline_code)
                    active_inline_code = None
            else:
                formats.append(code_marker)
                active_inline_code = code_marker
            i = j
        else:
            i += 1

    if active_inline_code:
        return formats

    word_without_code = re.sub(r'`+[^`]*`+', '', word)
    if not word_without_code.strip():
        return formats

    for tag in ["**", "__", "~~", "||"]:
        count = word_without_code.count(tag)
        for _ in range(count):
            if tag in formats:
                formats.remove(tag)
            else:
                formats.append(tag)

    temp_word = word_without_code.replace("**", "").replace("__", "")

    for tag in ["*", "_"]:
        positions = [i for i, c in enumerate(temp_word) if c == tag]
        for pos in positions:
            at_start = pos == 0
            at_end = pos == len(temp_word) - 1
            before_is_boundary = at_start or not temp_word[pos - 1].isalnum()
            after_is_boundary = at_end or not temp_word[pos + 1].isalnum()
            if before_is_boundary or after_is_boundary:
                if tag in formats:
                    formats.remove(tag)
                else:
                    formats.append(tag)

    return formats


def _close_formats(chunk: str, active_formats: List[str]) -> str:
    """Close all active markdown formats in reverse order."""
    for tag in reversed(active_formats):
        if tag.startswith('INLINE_CODE_'):
            n = int(tag.split('_')[2])
            chunk += '`' * n
        else:
            chunk += tag
    return chunk


def _open_formats(chunk: str, active_formats: List[str]) -> str:
    """Open all active markdown formats."""
    for tag in active_formats:
        if tag.startswith('INLINE_CODE_'):
            n = int(tag.split('_')[2])
            chunk += '`' * n
        else:
            chunk += tag
    return chunk