"""Resolve custom emojis from Discord and Matrix into image Attachments.
Custom emojis are platform-specific rich content that arrives as text
tokens (Discord ``<:name:id>``) or HTML ``<img>`` tags (Matrix). This
module extracts them from message text, downloads the images, and
returns :class:`~platforms.base.Attachment` objects so the LLM can
*see* the emojis as inline PNG images.
Both Discord and Matrix adapters call into these shared utilities from
their ``on_message`` handlers.
"""
from __future__ import annotations
import asyncio
import logging
import re
from dataclasses import dataclass
from typing import Any
import aiohttp
from platforms.base import Attachment
from platforms.media_common import download_with_retry
logger = logging.getLogger(__name__)
# ------------------------------------------------------------------
# Discord emoji extraction
# ------------------------------------------------------------------
# Custom emoji tokens look like ``<:name:id>``; animated ones carry a leading
# ``a`` (``<a:name:id>``).  Capture groups:
#   1. ``"a"`` for an animated emoji, otherwise the empty string
#   2. the emoji name (2-32 word characters)
#   3. the numeric emoji id (17-20 digits)
DISCORD_EMOJI_RE = re.compile(r"<(a?):(\w{2,32}):(\d{17,20})>")

# Base URL that emoji image URLs are built from (``<base>/<id>.<ext>``).
_DISCORD_CDN = "https://cdn.discordapp.com/emojis"
[docs]
@dataclass(frozen=True)
class DiscordEmojiMatch:
    """One custom emoji token found in a Discord message.

    Bundles the pieces needed for the two things done with a parsed emoji:
    fetching its image (``emoji_id`` and ``animated`` determine the CDN URL
    and file extension in :func:`_download_one_discord_emoji`) and replacing
    the raw token in the message text (``full_match`` and ``name`` are used
    by :func:`rewrite_discord_emoji_text`).

    The dataclass is frozen, so an instance cannot be mutated after it is
    created and can be handed to several concurrent download tasks as-is.

    Attributes:
        name: Short emoji name, e.g. ``"blobwave"``.
        emoji_id: Numeric Discord emoji id, kept as a string.
        animated: ``True`` for ``<a:...>`` tokens; selects ``.gif`` instead
            of ``.png`` when downloading.
        full_match: The token exactly as it appeared in the message; this is
            the text that gets replaced.
    """

    name: str
    emoji_id: str
    animated: bool
    full_match: str  # verbatim ``<:name:id>`` / ``<a:name:id>`` token
[docs]
def rewrite_discord_emoji_text(
    text: str,
    matches: list[DiscordEmojiMatch],
) -> str:
    """Replace raw custom-emoji tokens with ``[emoji: name]`` placeholders.

    Each entry in ``matches`` contributes one substitution: every occurrence
    of its exact token text (``full_match``) in ``text`` becomes
    ``[emoji: <name>]``. Tokens that are not listed in ``matches`` are left
    as they are, so the caller controls which emojis get rewritten
    (typically only those whose images were actually downloaded).

    This is pure string processing; nothing is fetched or written.

    Args:
        text: The message body to rewrite.
        matches: The emojis whose tokens should be replaced.

    Returns:
        str: ``text`` with the listed tokens replaced.
    """
    rewritten = text
    for match in matches:
        placeholder = f"[emoji: {match.name}]"
        rewritten = rewritten.replace(match.full_match, placeholder)
    return rewritten
async def _download_one_discord_emoji(
    em: DiscordEmojiMatch,
    media_cache: Any | None = None,
) -> Attachment | None:
    """Fetch a single Discord custom emoji and wrap it in an :class:`Attachment`.

    The CDN URL, mimetype and filename are all derived from the match:
    animated emojis use the ``gif`` extension, everything else ``png``.
    The bytes are obtained through :func:`platforms.media_common.download_with_retry`
    wrapping a plain HTTP GET. When ``media_cache`` is given, the download
    callable is handed to ``media_cache.get_or_download`` keyed by the URL
    instead of being awaited directly.

    Any exception raised while downloading or building the attachment is
    logged at warning level (with traceback) and turned into ``None``, so a
    single bad emoji cannot break a batch.

    Args:
        em: The emoji to fetch.
        media_cache: Optional object exposing
            ``get_or_download(url, download_callable)``.

    Returns:
        Attachment | None: The emoji image, or ``None`` when anything failed.
    """
    if em.animated:
        ext = "gif"
    else:
        ext = "png"
    url = f"{_DISCORD_CDN}/{em.emoji_id}.{ext}"
    mimetype = f"image/{ext}"
    filename = f"{em.name}_{em.emoji_id}.{ext}"

    async def _fetch_emoji() -> tuple[bytes, str, str]:
        """Issue one HTTP GET for ``url`` and return ``(bytes, mimetype, filename)``.

        ``raise_for_status`` turns an HTTP error status into an exception.
        """
        async with aiohttp.ClientSession() as session, session.get(url) as resp:
            resp.raise_for_status()
            payload = await resp.read()
            return payload, mimetype, filename

    async def _download() -> tuple[bytes, str, str]:
        """Run ``_fetch_emoji`` through ``download_with_retry``.

        This is the callable given to ``media_cache.get_or_download``.
        """
        return await download_with_retry(_fetch_emoji, label=filename)

    try:
        if media_cache is None:
            fetched = await _download()
        else:
            fetched = await media_cache.get_or_download(url, _download)
        body, content_type, resolved_name = fetched
        return Attachment(
            data=body,
            mimetype=content_type,
            filename=resolved_name,
            source_url=url,
        )
    except Exception:
        # Best-effort by design: report the failure and let the batch go on.
        logger.warning(
            "Failed to download Discord emoji %s (%s)",
            em.name,
            url,
            exc_info=True,
        )
        return None
[docs]
async def download_discord_emojis(
    matches: list[DiscordEmojiMatch],
    *,
    max_emojis: int = 5,
    media_cache: Any | None = None,
) -> list[Attachment]:
    """Download up to ``max_emojis`` Discord custom emojis concurrently.

    Takes at most ``max_emojis`` entries from the front of ``matches`` and
    starts one :func:`_download_one_discord_emoji` task per entry via
    :func:`asyncio.gather`. ``return_exceptions=True`` keeps one failing task
    from cancelling the others; only results that are real
    :class:`~platforms.base.Attachment` objects are returned, so failures
    (``None`` or an exception object) are silently dropped from the list.

    A zero or negative ``max_emojis`` means "download nothing". Without the
    clamp a negative value would be interpreted by the slice as "all but the
    last N", which is never what a cap is meant to do.

    Args:
        matches: Candidate emojis to download.
        max_emojis: Maximum number of emojis to fetch from the front of
            ``matches`` (default ``5``). Values below zero are treated as
            zero.
        media_cache: Optional cache exposing ``get_or_download``; forwarded
            unchanged to every download task.

    Returns:
        list[Attachment]: The successfully downloaded emoji attachments, in
        the order of ``matches``; failures and the over-the-limit tail are
        omitted.
    """
    # Clamp negatives to zero. ``None`` is passed through untouched so that a
    # caller relying on plain-slice "no cap" semantics keeps working.
    limit = None if max_emojis is None else max(max_emojis, 0)
    bounded = matches[:limit]
    if not bounded:
        return []

    results = await asyncio.gather(
        *(_download_one_discord_emoji(em, media_cache) for em in bounded),
        return_exceptions=True,
    )
    return [r for r in results if isinstance(r, Attachment)]
# ------------------------------------------------------------------
# Matrix emoji extraction
# ------------------------------------------------------------------
# Custom emojis in a Matrix ``formatted_body`` arrive as ``<img>`` tags whose
# ``src`` is a double-quoted ``mxc://`` content URI (such tags usually also
# carry a ``data-mx-emoticon`` attribute, which this pattern does not check).
# Group 1 captures the ``mxc://`` URI; matching is case-insensitive.
_MATRIX_EMOJI_IMG_RE = re.compile(
    r'<img\b[^>]*?\bsrc="(mxc://[^"]+)"[^>]*?>', flags=re.IGNORECASE
)

# Captures the (possibly empty) value of a double-quoted ``alt`` attribute.
_MATRIX_IMG_ALT_RE = re.compile(r'\balt="([^"]*)"', flags=re.IGNORECASE)
[docs]
@dataclass(frozen=True)
class MatrixEmojiMatch:
    """One custom emoji ``<img>`` tag found in Matrix formatted HTML.

    Holds what the rest of this module needs from a parsed tag:
    :func:`download_matrix_emojis` downloads ``mxc_url`` and derives a
    filename from ``alt_text``, while :func:`rewrite_matrix_emoji_text` turns
    ``alt_text`` into the shortcode it looks for in the plain-text body.

    The dataclass is frozen, so an instance cannot be mutated after it is
    created and can be handed to several concurrent download tasks as-is.

    Attributes:
        alt_text: The tag's ``alt`` text (often a ``:shortcode:``); surrounding
            colons are stripped by the consumers before use.
        mxc_url: The ``mxc://`` content URI of the image.
        full_tag: The complete original ``<img ...>`` tag text.
    """

    alt_text: str
    mxc_url: str
    full_tag: str  # verbatim ``<img ...>`` tag from the formatted body
[docs]
def rewrite_matrix_emoji_text(
    body: str,
    matches: list[MatrixEmojiMatch],
) -> str:
    """Replace Matrix emoji shortcodes in ``body`` with ``[emoji: name]``.

    For every match, the emoji name is its ``alt_text`` with leading and
    trailing colons removed (falling back to ``"emoji"`` when nothing is
    left). The shortcode ``:<name>:`` is then looked up in the plain-text
    body and its *first* occurrence only is replaced by ``[emoji: <name>]``.
    A body that does not contain the shortcode is left unchanged.

    This is pure string processing; nothing is fetched or written.

    Args:
        body: The plain-text message body to rewrite.
        matches: The emojis whose shortcodes should be replaced.

    Returns:
        str: The rewritten body.
    """
    rewritten = body
    for match in matches:
        # ``alt`` text usually looks like ":emoji_name:"; drop the colons.
        label = match.alt_text.strip(":") or "emoji"
        # A count-limited replace is a no-op when the shortcode is absent,
        # so no separate membership check is needed.
        rewritten = rewritten.replace(f":{label}:", f"[emoji: {label}]", 1)
    return rewritten
[docs]
async def download_matrix_emojis(
    matches: list[MatrixEmojiMatch],
    matrix_client: Any,
    *,
    max_emojis: int = 5,
    media_cache: Any | None = None,
) -> list[Attachment]:
    """Download up to ``max_emojis`` Matrix custom emojis concurrently.

    Takes at most ``max_emojis`` entries from the front of ``matches`` and
    runs the inner ``_download_one`` coroutine for each of them via
    :func:`asyncio.gather`. ``return_exceptions=True`` keeps one failing task
    from cancelling the others; only results that are real
    :class:`~platforms.base.Attachment` objects are returned.

    Each task calls ``matrix_client.download(mxc=...)`` (wrapped in
    :func:`platforms.media_common.download_with_retry`). When ``media_cache``
    is given, the download callable is handed to
    ``media_cache.get_or_download`` keyed by the ``mxc://`` URL instead of
    being awaited directly.

    A zero or negative ``max_emojis`` means "download nothing". Without the
    clamp a negative value would be interpreted by the slice as "all but the
    last N". ``nio.responses.DownloadError`` is imported only once there is
    something to download, so a call with no work to do does not depend on
    matrix-nio being importable.

    Args:
        matches: Candidate emojis to download.
        matrix_client: Client object providing an awaitable
            ``download(mxc=...)`` (a matrix-nio ``AsyncClient``).
        max_emojis: Maximum number of emojis to fetch from the front of
            ``matches`` (default ``5``). Values below zero are treated as
            zero.
        media_cache: Optional cache exposing ``get_or_download``.

    Returns:
        list[Attachment]: The successfully downloaded emoji attachments, in
        the order of ``matches``; failures and the over-the-limit tail are
        omitted.
    """
    # Clamp negatives to zero. ``None`` is passed through untouched so that a
    # caller relying on plain-slice "no cap" semantics keeps working.
    limit = None if max_emojis is None else max(max_emojis, 0)
    bounded = matches[:limit]
    if not bounded:
        return []

    # Deferred until there is real work so the empty path needs no matrix-nio.
    from nio.responses import DownloadError

    async def _download_one(em: MatrixEmojiMatch) -> Attachment | None:
        """Download a single Matrix custom emoji into an :class:`Attachment`.

        The filename is the emoji's alt text without surrounding colons
        (``"emoji"`` if that leaves nothing) plus a ``.png`` suffix. Any
        exception raised while downloading or building the attachment is
        logged at warning level and turned into ``None``.

        Args:
            em: The matched emoji to download.

        Returns:
            Attachment | None: The emoji image, or ``None`` on failure.
        """
        mxc_url = em.mxc_url
        clean_name = em.alt_text.strip(":") or "emoji"
        filename = f"{clean_name}.png"
        try:

            async def _fetch_emoji() -> tuple[bytes, str, str]:
                """Fetch the emoji once via ``matrix_client.download``.

                A ``DownloadError`` response is converted into a
                ``RuntimeError``; otherwise the body, the response content
                type (``image/png`` when unset) and ``filename`` are returned.
                """
                resp = await matrix_client.download(mxc=mxc_url)
                if isinstance(resp, DownloadError):
                    raise RuntimeError(
                        f"Matrix download failed for {mxc_url}: {resp.message}",
                    )
                mimetype = resp.content_type or "image/png"
                return resp.body, mimetype, filename

            async def _dl() -> tuple[bytes, str, str]:
                """Run ``_fetch_emoji`` through ``download_with_retry``.

                This is the callable given to ``media_cache.get_or_download``.
                """
                return await download_with_retry(_fetch_emoji, label=filename)

            if media_cache is not None:
                data, mimetype, fname = await media_cache.get_or_download(
                    mxc_url,
                    _dl,
                )
            else:
                data, mimetype, fname = await _dl()
            return Attachment(
                data=data,
                mimetype=mimetype,
                filename=fname,
                source_url=mxc_url,
            )
        except Exception:
            # Best-effort by design: report the failure and let the batch go on.
            logger.warning(
                "Failed to download Matrix emoji %s (%s)",
                em.alt_text,
                mxc_url,
                exc_info=True,
            )
            return None

    results = await asyncio.gather(
        *(_download_one(em) for em in bounded),
        return_exceptions=True,
    )
    return [r for r in results if isinstance(r, Attachment)]