# Source code for platforms.emoji_resolver

"""Resolve custom emojis from Discord and Matrix into image Attachments.

Custom emojis are platform-specific rich content that arrives as text
tokens (Discord ``<:name:id>``) or HTML ``<img>`` tags (Matrix).  This
module extracts them from message text, downloads the images, and
returns :class:`~platforms.base.Attachment` objects so the LLM can
*see* the emojis as inline images (typically PNG; GIF for animated
Discord emojis).

Both Discord and Matrix adapters call into these shared utilities from
their ``on_message`` handlers.
"""

from __future__ import annotations

import asyncio
import logging
import re
from dataclasses import dataclass
from typing import Any

import aiohttp

from platforms.base import Attachment
from platforms.media_common import download_with_retry

# Module-level logger; emoji download failures in this module are reported
# through it at warning level.
logger = logging.getLogger(__name__)

# ------------------------------------------------------------------
# Discord emoji extraction
# ------------------------------------------------------------------

# Matches  <:name:id>  and  <a:name:id>  (animated).
# Capture groups:
#   1 -> "a" for an animated emoji, empty string otherwise
#   2 -> emoji name (2-32 word characters)
#   3 -> numeric emoji id (17-20 digits)
DISCORD_EMOJI_RE = re.compile(
    r"<(a?):(\w{2,32}):(\d{17,20})>",
)

# Base URL for emoji images; "/<emoji_id>.<ext>" is appended when a
# download URL is built.
_DISCORD_CDN = "https://cdn.discordapp.com/emojis"


@dataclass(frozen=True)
class DiscordEmojiMatch:
    """One Discord custom-emoji token found in a message.

    Holds the pieces of a ``<:name:id>`` (or animated ``<a:name:id>``)
    token: enough to build the image URL for the emoji and to locate the
    token again when the message text is rewritten.  The dataclass is
    frozen, so instances cannot be modified after construction and are
    hashable.

    Attributes:
        name: Short emoji name, e.g. ``"blobwave"``.
        emoji_id: Numeric emoji id, kept as a string.
        animated: ``True`` when the token used the ``<a:...>`` form.
        full_match: The token exactly as it appeared in the message.
    """

    name: str
    emoji_id: str
    animated: bool
    # Verbatim ``<:name:id>`` / ``<a:name:id>`` token from the message text.
    full_match: str
def extract_discord_emojis(text: str) -> list[DiscordEmojiMatch]:
    """Return the distinct custom emojis in ``text``, in first-seen order.

    Every ``<:name:id>`` / ``<a:name:id>`` token matched by
    :data:`DISCORD_EMOJI_RE` is considered.  Tokens are deduplicated by
    emoji id: only the first token seen for a given id is kept, so the
    ``name``, ``animated`` flag and ``full_match`` of the result come from
    that first occurrence.  No I/O is performed.

    Args:
        text: Raw message body to scan.

    Returns:
        One :class:`DiscordEmojiMatch` per unique emoji id, ordered by
        first appearance; an empty list when nothing matches.
    """
    # dicts preserve insertion order, so keying by id both deduplicates
    # and keeps the order of first appearance.
    unique: dict[str, DiscordEmojiMatch] = {}
    for token in DISCORD_EMOJI_RE.finditer(text):
        animated_flag, name, emoji_id = token.groups()
        if emoji_id not in unique:
            unique[emoji_id] = DiscordEmojiMatch(
                name=name,
                emoji_id=emoji_id,
                # Group 1 is "a" for animated tokens, "" otherwise.
                animated=bool(animated_flag),
                full_match=token.group(0),
            )
    return list(unique.values())
def rewrite_discord_emoji_text(
    text: str,
    matches: list[DiscordEmojiMatch],
) -> str:
    """Replace raw emoji tokens in ``text`` with ``[emoji: name]`` markers.

    For each entry in ``matches``, every occurrence of its exact
    ``full_match`` token is replaced by ``[emoji: <name>]``.  Tokens that
    are not listed in ``matches`` are left untouched, so the caller
    controls which emojis get rewritten.  No I/O is performed.

    Args:
        text: Message body to rewrite.
        matches: Emojis whose tokens should be replaced.

    Returns:
        The rewritten text; identical to ``text`` when ``matches`` is
        empty or none of the tokens occur.
    """
    rewritten = text
    for match in matches:
        placeholder = f"[emoji: {match.name}]"
        rewritten = rewritten.replace(match.full_match, placeholder)
    return rewritten
async def _download_one_discord_emoji(
    em: DiscordEmojiMatch,
    media_cache: Any | None = None,
) -> Attachment | None:
    """Fetch a single Discord custom emoji and wrap it in an Attachment.

    The image URL is ``<_DISCORD_CDN>/<emoji_id>.<ext>`` where ``ext`` is
    ``gif`` for animated emojis and ``png`` otherwise.  The download goes
    through :func:`platforms.media_common.download_with_retry` (retry
    policy is defined there).  When ``media_cache`` is given, the
    download callable is handed to ``media_cache.get_or_download`` keyed
    by the URL instead of being awaited directly.

    Any ``Exception`` raised along the way is logged at warning level
    (with traceback) and turned into a ``None`` result, so callers never
    see a failure from this coroutine.

    Args:
        em: The emoji to download.
        media_cache: Optional object exposing
            ``get_or_download(key, async_callable)``; expected to return
            a ``(data, mimetype, filename)`` tuple.

    Returns:
        The emoji as an :class:`~platforms.base.Attachment`, or ``None``
        if anything went wrong.
    """
    extension = "gif" if em.animated else "png"
    cdn_url = f"{_DISCORD_CDN}/{em.emoji_id}.{extension}"
    default_mimetype = f"image/{extension}"
    default_filename = f"{em.name}_{em.emoji_id}.{extension}"

    async def _fetch_emoji() -> tuple[bytes, str, str]:
        """Perform one HTTP GET for the emoji; raises on a non-2xx status."""
        async with aiohttp.ClientSession() as session, session.get(cdn_url) as resp:
            resp.raise_for_status()
            payload = await resp.read()
            return payload, default_mimetype, default_filename

    async def _download() -> tuple[bytes, str, str]:
        """Run ``_fetch_emoji`` through the shared retry helper."""
        return await download_with_retry(_fetch_emoji, label=default_filename)

    try:
        if media_cache is None:
            data, mimetype, filename = await _download()
        else:
            # The cache decides whether ``_download`` needs to run at all.
            data, mimetype, filename = await media_cache.get_or_download(
                cdn_url,
                _download,
            )
        return Attachment(
            data=data,
            mimetype=mimetype,
            filename=filename,
            source_url=cdn_url,
        )
    except Exception:
        # Best-effort: one bad emoji must not break the caller's batch.
        logger.warning(
            "Failed to download Discord emoji %s (%s)",
            em.name,
            cdn_url,
            exc_info=True,
        )
        return None
async def download_discord_emojis(
    matches: list[DiscordEmojiMatch],
    *,
    max_emojis: int = 5,
    media_cache: Any | None = None,
) -> list[Attachment]:
    """Download up to ``max_emojis`` Discord custom emojis concurrently.

    Takes the first ``max_emojis`` entries of ``matches`` and runs one
    :func:`_download_one_discord_emoji` coroutine per entry through
    :func:`asyncio.gather`.  Only results that are
    :class:`~platforms.base.Attachment` instances are returned, so failed
    downloads (``None``) and any exception objects collected by
    ``return_exceptions=True`` are dropped.

    Args:
        matches: Candidate emojis, usually from
            :func:`extract_discord_emojis`.
        max_emojis: Maximum number of emojis to fetch from the front of
            ``matches``.  Zero or a negative value fetches nothing.
        media_cache: Optional cache object passed through unchanged to
            each download.

    Returns:
        The successfully downloaded attachments, in the order of
        ``matches``; empty when there is nothing to download.
    """
    # A negative limit must mean "none": a plain ``matches[:max_emojis]``
    # would slice from the end and silently download almost everything.
    if max_emojis <= 0:
        return []

    bounded = matches[:max_emojis]
    if not bounded:
        return []

    results = await asyncio.gather(
        *(_download_one_discord_emoji(em, media_cache) for em in bounded),
        return_exceptions=True,
    )
    return [r for r in results if isinstance(r, Attachment)]
# ------------------------------------------------------------------ # Matrix emoji extraction # ------------------------------------------------------------------ # Matches <img ... src="mxc://..." ...> in Matrix formatted_body HTML. # Matrix custom emojis use data-mx-emoticon attribute and mxc:// src. _MATRIX_EMOJI_IMG_RE = re.compile( r'<img\b[^>]*?\bsrc="(mxc://[^"]+)"[^>]*?>', re.IGNORECASE, ) # Extract alt text from an <img> tag _MATRIX_IMG_ALT_RE = re.compile( r'\balt="([^"]*)"', re.IGNORECASE, )
@dataclass(frozen=True)
class MatrixEmojiMatch:
    """One custom-emoji ``<img>`` tag found in a Matrix HTML body.

    Holds what is needed to download the emoji image and to find its
    shortcode in the plain-text body.  The dataclass is frozen, so
    instances cannot be modified after construction and are hashable.

    Attributes:
        alt_text: Value of the tag's ``alt`` attribute (often a
            ``:shortcode:``); used as the rewrite label and to derive a
            filename.
        mxc_url: The ``mxc://`` content URI taken from the tag's ``src``.
        full_tag: The whole ``<img ...>`` tag as it appeared in the HTML.
    """

    alt_text: str
    mxc_url: str
    # Verbatim ``<img ...>`` tag from the formatted body.
    full_tag: str
def extract_matrix_emojis(formatted_body: str) -> list[MatrixEmojiMatch]:
    """Return the distinct emoji ``<img>`` tags in a Matrix HTML body.

    Every ``<img>`` tag matched by :data:`_MATRIX_EMOJI_IMG_RE` (i.e. one
    with a double-quoted ``mxc://`` ``src``) is considered.  Tags are
    deduplicated by ``mxc://`` URI, keeping the first occurrence.  The
    ``alt`` attribute is read with :data:`_MATRIX_IMG_ALT_RE`; when the
    tag has no ``alt`` attribute the label falls back to ``"emoji"``.
    No I/O is performed.

    Args:
        formatted_body: The Matrix HTML ``formatted_body`` to scan.

    Returns:
        One :class:`MatrixEmojiMatch` per unique ``mxc://`` URI, ordered
        by first appearance; an empty list when nothing matches.
    """
    # dicts preserve insertion order, so keying by URI both deduplicates
    # and keeps the order of first appearance.
    by_uri: dict[str, MatrixEmojiMatch] = {}
    for tag in _MATRIX_EMOJI_IMG_RE.finditer(formatted_body):
        uri = tag.group(1)
        if uri in by_uri:
            continue
        html = tag.group(0)
        alt = _MATRIX_IMG_ALT_RE.search(html)
        by_uri[uri] = MatrixEmojiMatch(
            alt_text=alt.group(1) if alt else "emoji",
            mxc_url=uri,
            full_tag=html,
        )
    return list(by_uri.values())
def rewrite_matrix_emoji_text(
    body: str,
    matches: list[MatrixEmojiMatch],
) -> str:
    """Replace emoji shortcodes in ``body`` with ``[emoji: name]`` markers.

    For each match, leading and trailing colons are stripped from its
    ``alt_text`` to get the bare name (``"emoji"`` if nothing is left).
    The first occurrence of ``:<name>:`` in the body, if any, is then
    replaced by ``[emoji: <name>]``.  Only one occurrence is replaced per
    match.  No I/O is performed.

    Args:
        body: Plain-text message body to rewrite.
        matches: Emojis whose shortcodes should be replaced.

    Returns:
        The rewritten body; unchanged when no shortcode is present.
    """
    for em in matches:
        # Alt text usually looks like ":emoji_name:"; keep just the name.
        label = em.alt_text.strip(":") or "emoji"
        # ``str.replace`` is a no-op when the shortcode is absent, so no
        # separate membership check is needed.
        body = body.replace(f":{label}:", f"[emoji: {label}]", 1)
    return body
async def download_matrix_emojis(
    matches: list[MatrixEmojiMatch],
    matrix_client: Any,
    *,
    max_emojis: int = 5,
    media_cache: Any | None = None,
) -> list[Attachment]:
    """Download up to ``max_emojis`` Matrix custom emojis concurrently.

    Takes the first ``max_emojis`` entries of ``matches`` and runs one
    download coroutine per entry through :func:`asyncio.gather`.  Each
    download calls ``matrix_client.download(mxc=...)`` via
    :func:`platforms.media_common.download_with_retry` (retry policy is
    defined there).  When ``media_cache`` is given, the download callable
    is handed to ``media_cache.get_or_download`` keyed by the ``mxc://``
    URI instead of being awaited directly.

    Only results that are :class:`~platforms.base.Attachment` instances
    are returned, so failed downloads and any exception objects collected
    by ``return_exceptions=True`` are dropped.

    Args:
        matches: Candidate emojis, usually from
            :func:`extract_matrix_emojis`.
        matrix_client: Client object exposing an awaitable
            ``download(mxc=...)`` (a matrix-nio ``AsyncClient``).
        max_emojis: Maximum number of emojis to fetch from the front of
            ``matches``.  Zero or a negative value fetches nothing.
        media_cache: Optional object exposing
            ``get_or_download(key, async_callable)``; expected to return
            a ``(data, mimetype, filename)`` tuple.

    Returns:
        The successfully downloaded attachments, in the order of
        ``matches``; empty when there is nothing to download.
    """
    # A negative limit must mean "none": a plain ``matches[:max_emojis]``
    # would slice from the end and silently download almost everything.
    if max_emojis <= 0:
        return []

    bounded = matches[:max_emojis]
    if not bounded:
        return []

    # Imported only once there is real work to do, so an empty batch does
    # not require matrix-nio to be importable.
    from nio.responses import DownloadError

    async def _download_one(em: MatrixEmojiMatch) -> Attachment | None:
        """Fetch one emoji; log and return ``None`` on any ``Exception``."""
        mxc_url = em.mxc_url
        clean_name = em.alt_text.strip(":") or "emoji"
        # The filename always ends in ".png", whatever content type the
        # server reports.
        # NOTE(review): alt text comes from the remote message; confirm
        # that consumers of ``Attachment.filename`` never use it as a
        # filesystem path without sanitising it.
        filename = f"{clean_name}.png"

        async def _fetch_emoji() -> tuple[bytes, str, str]:
            """Perform one ``matrix_client.download`` call.

            Raises ``RuntimeError`` when nio returns a ``DownloadError``
            response instead of raising.
            """
            resp = await matrix_client.download(mxc=mxc_url)
            if isinstance(resp, DownloadError):
                raise RuntimeError(
                    f"Matrix download failed for {mxc_url}: {resp.message}",
                )
            mimetype = resp.content_type or "image/png"
            return resp.body, mimetype, filename

        async def _dl() -> tuple[bytes, str, str]:
            """Run ``_fetch_emoji`` through the shared retry helper."""
            return await download_with_retry(_fetch_emoji, label=filename)

        try:
            if media_cache is not None:
                # The cache decides whether ``_dl`` needs to run at all.
                data, mimetype, fname = await media_cache.get_or_download(
                    mxc_url,
                    _dl,
                )
            else:
                data, mimetype, fname = await _dl()
            return Attachment(
                data=data,
                mimetype=mimetype,
                filename=fname,
                source_url=mxc_url,
            )
        except Exception:
            # Best-effort: one bad emoji must not break the whole batch.
            logger.warning(
                "Failed to download Matrix emoji %s (%s)",
                em.alt_text,
                mxc_url,
                exc_info=True,
            )
            return None

    results = await asyncio.gather(
        *(_download_one(em) for em in bounded),
        return_exceptions=True,
    )
    return [r for r in results if isinstance(r, Attachment)]