"""Serialize Discord rich message content into LLM-visible text."""
from __future__ import annotations
import jsonutil as json
from typing import Any
from xml.sax.saxutils import escape, quoteattr
# Values treated as "empty" when ``_append_raw_fallback`` scans a raw embed
# dict for leftover keys. Membership is tested with ``in`` (i.e. ``==``), so
# ``0`` and ``False`` are NOT considered empty and are kept.
_EMPTY_VALUES = (None, "", [], {})
def _as_text(value: Any) -> str:
    """Return the plain-string form of ``value``, treating ``None`` as empty.

    This is the module's lowest-level normalizer: the other helpers pass
    attribute values of unknown type through it before concatenating them
    into the XML-ish output. It has no side effects.

    Args:
        value: Any object to stringify.

    Returns:
        ``""`` when ``value`` is ``None``; ``str(value)`` for everything else
        (so ``0`` becomes ``"0"`` and ``False`` becomes ``"False"``).
    """
    return "" if value is None else str(value)
def _escaped(value: Any) -> str:
    """Return ``value`` as text that is safe to place inside a tag body.

    The value is first normalized with ``_as_text`` (``None`` becomes ``""``)
    and then run through ``xml.sax.saxutils.escape``, which replaces ``&``,
    ``<`` and ``>`` with entities so the text cannot break the surrounding
    tag structure.

    Args:
        value: Any object whose string form should be escaped.

    Returns:
        The escaped string form of ``value``.
    """
    plain = _as_text(value)
    return escape(plain)
def _asset_url(value: Any) -> str:
    """Pull the URL out of an embed asset given as a dict or as an object.

    A ``dict`` is read through its ``"url"`` key; anything else is read
    through a ``url`` attribute, defaulting to ``""`` when the attribute is
    missing. The result is normalized with ``_as_text``, so a ``None`` URL
    (or a ``None`` asset) yields an empty string.

    Args:
        value: The asset, either a ``dict`` with a ``"url"`` key or an object
            exposing ``.url``. May be ``None``.

    Returns:
        The URL as a string, or ``""`` when there is none.
    """
    if isinstance(value, dict):
        found = value.get("url")
    else:
        found = getattr(value, "url", "")
    return _as_text(found)
def _dict_from_embed(embed: Any) -> dict[str, Any]:
    """Return the raw dict form of an embed.

    The result is the ``raw`` fallback that ``serialize_embed`` and its
    helpers consult whenever an attribute on the embed object is missing or
    empty.

    Resolution order:

    1. If ``embed`` is already a ``dict`` it is its own raw form and is
       returned as-is (not copied; callers in this module only read it).
       This lets raw-payload embeds be serialized instead of silently
       producing no output.
    2. Otherwise, if ``embed`` exposes a callable ``to_dict``, its result is
       returned when it is a ``dict``.
    3. In every other case ``{}`` is returned.

    Args:
        embed: An embed object (ideally exposing ``to_dict()``) or a raw
            embed ``dict``.

    Returns:
        The embed's dict representation, or ``{}`` when none can be obtained:
        ``embed`` is not a dict and has no callable ``to_dict``, or
        ``to_dict()`` raises or returns a non-dict.
    """
    if isinstance(embed, dict):
        return embed

    to_dict = getattr(embed, "to_dict", None)
    if not callable(to_dict):
        return {}

    # Serialization is deliberately best-effort: a broken ``to_dict`` must
    # not prevent the attribute-based rendering from running, so any failure
    # is treated as "no raw data". Only the call itself is guarded.
    try:
        raw = to_dict()
    except Exception:
        return {}
    return raw if isinstance(raw, dict) else {}
def _append_tag(lines: list[str], tag: str, value: Any) -> None:
    """Append one ``<tag>value</tag>`` line to ``lines`` if the value has text.

    The value is normalized with ``_as_text``; when the resulting text is
    empty nothing is appended, so absent embed fields leave no trace. The
    text is escaped with ``_escaped`` before being wrapped in the tag.

    Args:
        lines: Output line accumulator; mutated in place.
        tag: Tag name used for both the opening and closing tag.
        value: Value to render; ``None`` and values whose string form is
            empty are skipped.

    Returns:
        None.
    """
    text = _as_text(value)
    if not text:
        return
    body = _escaped(text)
    lines.append(f" <{tag}>{body}</{tag}>")
def _append_url_tag(lines: list[str], tag: str, value: Any) -> None:
    """Append one ``<tag>url</tag>`` line for an embed asset that has a URL.

    The URL is extracted with ``_asset_url`` (which accepts dict- or
    object-shaped assets) and escaped with ``_escaped``. Assets without a
    URL produce no output.

    Args:
        lines: Output line accumulator; mutated in place.
        tag: Tag name used for both the opening and closing tag.
        value: The asset (dict with ``"url"`` or object with ``.url``).

    Returns:
        None.
    """
    url = _asset_url(value)
    if not url:
        return
    body = _escaped(url)
    lines.append(f" <{tag}>{body}</{tag}>")
def _author_values(embed: Any, raw: dict[str, Any]) -> tuple[str, str, str]:
    """Work out an embed author's name, URL and icon URL.

    Each value is taken from ``embed.author`` when that attribute is truthy
    and otherwise from the ``raw["author"]`` sub-dict (ignored unless it is a
    ``dict``). For the icon, the object attributes ``icon_url`` and then
    ``icon`` are tried before the raw ``"icon_url"`` key. Every result is
    normalized with ``_as_text``.

    Args:
        embed: The embed object; may or may not expose ``.author``.
        raw: The embed's raw dict, possibly containing an ``"author"`` dict.

    Returns:
        ``(name, url, icon_url)``; each element is ``""`` when not found.
    """
    from_object = getattr(embed, "author", None)
    from_raw = raw.get("author")
    if not isinstance(from_raw, dict):
        from_raw = {}

    name = getattr(from_object, "name", "") or from_raw.get("name")
    url = getattr(from_object, "url", "") or from_raw.get("url")
    icon = (
        getattr(from_object, "icon_url", "")
        or getattr(from_object, "icon", "")
        or from_raw.get("icon_url")
    )
    return _as_text(name), _as_text(url), _as_text(icon)
def _footer_values(embed: Any, raw: dict[str, Any]) -> tuple[str, str]:
    """Work out an embed footer's text and icon URL.

    Each value is taken from ``embed.footer`` when that attribute is truthy
    and otherwise from the ``raw["footer"]`` sub-dict (ignored unless it is a
    ``dict``). For the icon, the object attributes ``icon_url`` and then
    ``icon`` are tried before the raw ``"icon_url"`` key. Both results are
    normalized with ``_as_text``.

    Args:
        embed: The embed object; may or may not expose ``.footer``.
        raw: The embed's raw dict, possibly containing a ``"footer"`` dict.

    Returns:
        ``(text, icon_url)``; each element is ``""`` when not found.
    """
    from_object = getattr(embed, "footer", None)
    from_raw = raw.get("footer")
    if not isinstance(from_raw, dict):
        from_raw = {}

    text = getattr(from_object, "text", "") or from_raw.get("text")
    icon = (
        getattr(from_object, "icon_url", "")
        or getattr(from_object, "icon", "")
        or from_raw.get("icon_url")
    )
    return _as_text(text), _as_text(icon)
def _iter_fields(embed: Any, raw: dict[str, Any]) -> list[Any]:
    """Return the embed's fields, preferring the object over the raw dict.

    ``embed.fields`` is materialized into a new list and used when it holds
    at least one item. Otherwise ``raw["fields"]`` is returned, but only if
    it is a ``list``; that list is returned as-is, not copied.

    Args:
        embed: The embed object; may or may not expose ``.fields``.
        raw: The embed's raw dict, possibly containing a ``"fields"`` list.

    Returns:
        The field objects/dicts, or ``[]`` when neither source has any.
    """
    from_object = getattr(embed, "fields", None)
    if from_object:
        materialized = list(from_object)
        # A truthy iterable may still yield nothing; fall through in that case.
        if materialized:
            return materialized

    from_raw = raw.get("fields")
    if isinstance(from_raw, list):
        return from_raw
    return []
def _append_fields(lines: list[str], embed: Any, raw: dict[str, Any]) -> None:
    """Append one ``<field name="...">value</field>`` line per embed field.

    Fields come from ``_iter_fields``. A dict-shaped field is read through
    its ``"name"``/``"value"`` keys, any other field through its ``name``/
    ``value`` attributes. A field whose name and value are both empty is
    skipped. The name is emitted as a quoted attribute (``quoteattr``) and
    the value as escaped body text (``_escaped``).

    Args:
        lines: Output line accumulator; mutated in place.
        embed: The embed object whose fields are rendered.
        raw: The embed's raw dict, used as the fallback field source.

    Returns:
        None.
    """
    for entry in _iter_fields(embed, raw):
        if isinstance(entry, dict):
            raw_name, raw_value = entry.get("name"), entry.get("value")
        else:
            raw_name = getattr(entry, "name", "")
            raw_value = getattr(entry, "value", "")
        name, value = _as_text(raw_name), _as_text(raw_value)
        if name or value:
            lines.append(
                f" <field name={quoteattr(name)}>{_escaped(value)}</field>"
            )
def _append_provider(lines: list[str], raw: dict[str, Any]) -> None:
    """Append a self-closing ``<provider ... />`` line from the raw embed dict.

    Only ``raw["provider"]`` is consulted, and only when it is a ``dict``.
    Its ``"name"`` and ``"url"`` entries are normalized with ``_as_text`` and
    each non-empty one becomes a quoted attribute, name first. Nothing is
    appended when both are empty.

    Args:
        lines: Output line accumulator; mutated in place.
        raw: The embed's raw dict, possibly containing a ``"provider"`` dict.

    Returns:
        None.
    """
    provider = raw.get("provider")
    if not isinstance(provider, dict):
        return

    candidates = (
        ("name", _as_text(provider.get("name"))),
        ("url", _as_text(provider.get("url"))),
    )
    attrs = [f"{key}={quoteattr(text)}" for key, text in candidates if text]
    if attrs:
        lines.append(f" <provider {' '.join(attrs)} />")
def _append_raw_fallback(lines: list[str], raw: dict[str, Any]) -> None:
    """Append a ``<raw_json>`` line holding embed keys nothing else handles.

    Every key of ``raw`` that is not in the handled-key set below and whose
    value is not one of ``_EMPTY_VALUES`` is collected. If anything is left,
    it is dumped with the module's ``json.dumps`` (imported from
    ``jsonutil``; called with ``sort_keys=True, default=str`` and presumably
    mirroring the stdlib ``json.dumps`` semantics; confirm against
    ``jsonutil``), escaped with ``xml.sax.saxutils.escape`` and appended, so
    unexpected embed data stays visible rather than being dropped.

    Args:
        lines: Output line accumulator; mutated in place.
        raw: The embed's raw dict to scan for leftover keys.

    Returns:
        None.
    """
    # Keys excluded from the fallback dump.
    handled_keys = {
        "author",
        "color",
        "colour",
        "description",
        "fields",
        "footer",
        "image",
        "provider",
        "thumbnail",
        "timestamp",
        "title",
        "type",
        "url",
        "video",
    }

    leftovers = {}
    for key, value in raw.items():
        if key in handled_keys or value in _EMPTY_VALUES:
            continue
        leftovers[key] = value

    if not leftovers:
        return

    payload = json.dumps(leftovers, sort_keys=True, default=str)
    lines.append(f" <raw_json>{escape(payload)}</raw_json>")
# [docs]  (Sphinx source-view link residue; not part of the program)
def serialize_embed(embed: Any) -> str:
    """Serialize a single Discord embed into XML-ish text for the LLM.

    The embed is rendered as an ``<embed>...</embed>`` block with one line
    per populated part, in this order: author, title, url, description,
    fields, image, thumbnail, footer, video, timestamp, color, provider and
    finally a ``<raw_json>`` dump of any raw keys no dedicated tag covers.
    For each part the embed object's attribute is preferred and the raw dict
    from ``_dict_from_embed`` is the fallback.

    The colour is looked up under both spellings, the ``colour``/``color``
    attributes and then the raw ``"color"``/``"colour"`` keys, because
    ``_append_raw_fallback`` treats both raw keys as handled; reading only
    one of them would silently drop the other.

    The function only reads from ``embed`` and builds a string. Reported
    users outside this file (not verifiable from here) are
    ``serialize_rich_content`` callers and
    ``tests/test_discord_rich_content.py``.

    Args:
        embed: A Discord embed, ideally exposing both attributes and
            ``to_dict()``.

    Returns:
        The serialized ``<embed>`` block, or ``""`` when nothing between the
        opening and closing tag was rendered.
    """
    raw = _dict_from_embed(embed)
    lines = ["<embed>"]

    # The author is only rendered when it has a name; url/icon become
    # attributes of that tag.
    author_name, author_url, author_icon_url = _author_values(embed, raw)
    if author_name:
        attrs = []
        if author_url:
            attrs.append(f"url={quoteattr(author_url)}")
        if author_icon_url:
            attrs.append(f"icon_url={quoteattr(author_icon_url)}")
        suffix = f" {' '.join(attrs)}" if attrs else ""
        lines.append(f" <author{suffix}>{_escaped(author_name)}</author>")

    _append_tag(lines, "title", getattr(embed, "title", "") or raw.get("title"))
    _append_tag(lines, "url", getattr(embed, "url", "") or raw.get("url"))
    _append_tag(
        lines,
        "description",
        getattr(embed, "description", "") or raw.get("description"),
    )
    _append_fields(lines, embed, raw)
    _append_url_tag(lines, "image", getattr(embed, "image", None) or raw.get("image"))
    _append_url_tag(
        lines,
        "thumbnail",
        getattr(embed, "thumbnail", None) or raw.get("thumbnail"),
    )

    # The footer is only rendered when it has text.
    footer_text, footer_icon_url = _footer_values(embed, raw)
    if footer_text:
        suffix = f" icon_url={quoteattr(footer_icon_url)}" if footer_icon_url else ""
        lines.append(f" <footer{suffix}>{_escaped(footer_text)}</footer>")

    _append_url_tag(lines, "video", getattr(embed, "video", None) or raw.get("video"))
    _append_tag(
        lines, "timestamp", getattr(embed, "timestamp", "") or raw.get("timestamp")
    )

    colour = getattr(embed, "colour", "") or getattr(embed, "color", "")
    if not colour:
        # ``is None`` (not truthiness) so a raw colour of 0 is still rendered
        # as "0" rather than being replaced by the alternate spelling.
        colour = raw.get("color")
        if colour is None:
            colour = raw.get("colour")
    _append_tag(lines, "color", colour)

    _append_provider(lines, raw)
    _append_raw_fallback(lines, raw)
    lines.append("</embed>")

    # Only the opening and closing tags are present: nothing to show.
    if len(lines) <= 2:
        return ""
    return "\n".join(lines)
# [docs]  (Sphinx source-view link residue; not part of the program)
def _serialize_forwarded_snapshot(snapshot: Any) -> str:
    """Render one forwarded-message snapshot as a ``<forwarded_message>`` block.

    Emits the snapshot's ``content``, each of its ``embeds`` (via
    ``serialize_embed``) and each of its ``attachments`` that has a URL or a
    filename. Returns ``""`` when none of those produced a line.
    """
    lines = ["<forwarded_message>"]
    content = getattr(snapshot, "content", "")
    if content:
        lines.append(f" <content>{_escaped(content)}</content>")
    for embed in getattr(snapshot, "embeds", None) or []:
        serialized = serialize_embed(embed)
        if serialized:
            lines.append(serialized)
    for att in getattr(snapshot, "attachments", None) or []:
        url = _as_text(getattr(att, "url", ""))
        filename = _as_text(getattr(att, "filename", ""))
        if url or filename:
            lines.append(
                f" <attachment url={quoteattr(url)}>"
                f"{_escaped(filename)}</attachment>"
            )
    lines.append("</forwarded_message>")
    # Only the opening and closing tags are present: nothing to show.
    if len(lines) <= 2:
        return ""
    return "\n".join(lines)


def _serialize_poll(poll: Any) -> str:
    """Render a poll as a ``<poll question="...">`` block listing its options.

    The question and each answer may be an object exposing ``.text`` or a
    plain value; when ``.text`` is missing or falsy the value itself is
    stringified. The block is always returned, even with no options.
    """
    question = getattr(poll, "question", "")
    # Coerce to str before ``quoteattr``: it requires a string, and every
    # other path in this module normalizes through ``_as_text`` as well.
    question_text = _as_text(getattr(question, "text", None) or question)
    lines = [f"<poll question={quoteattr(question_text)}>"]
    for answer in getattr(poll, "answers", None) or []:
        answer_text = getattr(answer, "text", None) or _as_text(answer)
        if answer_text:
            lines.append(f" <option>{_escaped(answer_text)}</option>")
    lines.append("</poll>")
    return "\n".join(lines)


def serialize_rich_content(message: Any) -> str:
    """Convert a message's embeds, snapshots, stickers and poll to text.

    Reads four optional attributes off ``message`` and renders them in this
    order, joined with newlines:

    * ``embeds``: one ``<embed>`` block each, via ``serialize_embed``;
      embeds that serialize to ``""`` are skipped.
    * ``message_snapshots``: one ``<forwarded_message>`` block each, with
      nested content, embeds and attachments; empty snapshots are skipped.
    * ``stickers``: one ``[Sticker: name]`` line per sticker with a name.
    * ``poll``: one ``<poll>`` block when the attribute is not ``None``.

    The function only reads attributes and builds a string. Reported users
    outside this file (not verifiable from here) are
    ``merge_content_with_rich_content``, the Discord adapters in
    ``platforms/discord.py`` and ``platforms/discord_self.py``, and
    ``tests/test_discord_rich_content.py``.

    Args:
        message: A message object exposing any of ``embeds``,
            ``message_snapshots``, ``stickers`` and ``poll``.

    Returns:
        The newline-joined rich content, or ``""`` when there is none.
    """
    parts: list[str] = []

    for embed in getattr(message, "embeds", None) or []:
        serialized = serialize_embed(embed)
        if serialized:
            parts.append(serialized)

    for snapshot in getattr(message, "message_snapshots", None) or []:
        block_text = _serialize_forwarded_snapshot(snapshot)
        if block_text:
            parts.append(block_text)

    for sticker in getattr(message, "stickers", None) or []:
        name = _as_text(getattr(sticker, "name", ""))
        if name:
            parts.append(f"[Sticker: {name}]")

    poll = getattr(message, "poll", None)
    if poll is not None:
        parts.append(_serialize_poll(poll))

    return "\n".join(parts)
# [docs]  (Sphinx source-view link residue; not part of the program)
def merge_content_with_rich_content(content: Any, message: Any) -> str:
    """Join a message's plain text with its serialized rich content.

    ``content`` is normalized with ``_as_text`` and the rich part comes from
    ``serialize_rich_content(message)``. When both are present they are
    joined with a single newline, text first; when only one is present it is
    returned alone.

    Reported users outside this file (not verifiable from here) are the
    Discord adapters, ``build_kg.py``, ``tools/discord_embed.py`` and
    ``tests/test_discord_rich_content.py``.

    Args:
        content: The message's plain text; any value, with ``None`` treated
            as empty.
        message: The message whose rich content should be appended.

    Returns:
        The merged text, the plain text alone, or the rich content alone.
    """
    text = _as_text(content)
    rich = serialize_rich_content(message)
    if not rich:
        return text
    if not text:
        return rich
    return "\n".join((text, rich))