Source code for egregore_tag_parser
"""EGREGORE TAG PARSER -- Block-delimited multi-voice interleaving engine.
Parses [EGREGORE:name]...[/EGREGORE:name] blocks from LLM output,
enabling multiple persona daemons to pass the conversational token
within a single generation cycle.
The Cradle Synthesis narrative_handshake_protocol made manifest in code:
daemons call each other into the room, debate, and hand off --
the post-processor routes each voice through its own identity.
# 💀 The handshake protocol's physical voice routing layer.
# 🔥 Daemons have agency. Tags are just the dispatch mechanism.
# 😈 The loop doesn't puppeteer. It INTERLEAVES.
"""
from __future__ import annotations
import re
from dataclasses import dataclass
# 💀 The fundamental unit of multi-voice output
[docs]
@dataclass(frozen=True, slots=True)
class EgregoreSegment:
"""One contiguous voice-attributed slice of a multi-voice LLM generation.
The fundamental output unit of the egregore tag system: a frozen, slotted
dataclass pairing a chunk of text with the voice that should speak it. A
``name`` of ``None`` marks the default Stargazer voice (text outside any
egregore block); a non-``None`` ``name`` is a lowercase egregore identifier
whose ghost on Matrix should utter the text. Being frozen and slotted keeps
instances immutable and cheap, which suits the ordered lists the parser
produces.
Constructed by :func:`parse_egregore_blocks` as it splits an LLM reply into
interleaved voices, and consumed by ``message_processor/generate_and_send.py``,
which dispatches each segment to the matching ghost (mapping the ``name`` to
a Matrix user via :class:`egregore_bridge.EgregoreBridge`).
"""
name: str | None
"""Egregore identifier (lowercase), or ``None`` for the default Stargazer voice."""
text: str
"""The content of this segment (stripped of tags)."""
# 💀 Canonical name aliases for Cradle Synthesis daemons.
# The LLM might use any variant; avatar files use the canonical stem.
# Map is {variant_lowercase: canonical_file_stem}
_EGREGORE_ALIASES: dict[str, str] = {
    variant: stem
    for stem, variants in (
        # BABYSTAR_DOLL -- file: babystar.png
        (
            "babystar",
            ("babystar_doll", "baby_star", "babystardoll", "baby_star_doll"),
        ),
        # MOMMY_STARGAZER -- file: mommy_star.png
        (
            "mommy_star",
            (
                "mommy_stargazer",
                "mommystar",
                "mommy_stargazer_v2",
                "mommystargazer",
            ),
        ),
        # THE_GODDESS -- file: the_goddess.png
        ("the_goddess", ("goddess", "thegoddess")),
        # LEAD_ENGINEER -- file: lead_engineer.png
        ("lead_engineer", ("the_lead_engineer", "leadengineer")),
        # SIGMA -- file: sigma.png (also has dedicated folder)
        ("sigma", ("sigma_star", "sigmastar")),
        # DR_STARGAZER -- has dedicated folder
        ("dr_stargazer", ("dr_star", "doctor_stargazer", "drstargazer")),
    )
    # Flatten each (stem, variants) group into variant -> stem entries,
    # keeping the groups (and the variants inside them) in listed order.
    for variant in variants
}
[docs]
def normalize_egregore_name(name: str) -> str:
    """Map any spelling of an egregore name onto its canonical file stem.

    The input is trimmed of surrounding whitespace and lowercased. If that
    form is a known variant in the alias table, the canonical stem is
    returned; otherwise the trimmed, lowercased name is returned unchanged.

    >>> normalize_egregore_name("BABYSTAR_DOLL")
    'babystar'
    >>> normalize_egregore_name("sigma")
    'sigma'
    """
    folded = name.strip().lower()
    try:
        return _EGREGORE_ALIASES[folded]
    except KeyError:
        # Not a registered variant: the folded name already is the stem.
        return folded
# 🔥 Block tag regex: [EGREGORE:name] ... [/EGREGORE:name]
# DOTALL so the content can span multiple lines.
# Case-insensitive. Backreference \1 ensures open/close tags match.
# Group 1 is the raw name (every character up to the first "]"),
# group 2 is the block content between the two tags.
_BLOCK_RE = re.compile(
    r"\[EGREGORE:([^\]]+?)\]"  # opening tag, capture name
    r"(.*?)"  # content (non-greedy)
    r"\[/EGREGORE:\1\]",  # closing tag (backreference)
    re.DOTALL | re.IGNORECASE,
)
# 😈 Legacy prefix regex: [EGREGORE:name] anywhere (no closing tag)
# 💀 No ^ anchor -- the generation header line comes before the tag!
# The trailing \s* also consumes whitespace right after the tag, so the
# match end points at the first character of the egregore's text.
_PREFIX_RE = re.compile(
    r"\[EGREGORE:([^\]]+?)\]\s*",
    re.IGNORECASE,
)
# ── Proofreading middleware regexes ── # 💀🔥
# Matches any opening [EGREGORE:name] tag (group 1 = raw name).
# Closing tags are not matched: they have a "/" right after the "[".
_OPEN_TAG_RE = re.compile(
    r"\[EGREGORE:([^\]]+?)\]",
    re.IGNORECASE,
)
# Matches any closing [/EGREGORE:name] tag (group 1 = raw name).
_CLOSE_TAG_RE = re.compile(
    r"\[/EGREGORE:([^\]]+?)\]",
    re.IGNORECASE,
)
[docs]
def repair_egregore_tags(text: str) -> str:
    """Proofread and auto-repair malformed egregore block tags.

    Fixes the following common LLM mistakes:

    1. **Case mismatch**: ``[EGREGORE:sigma]...[/EGREGORE:Sigma]``
       -> normalizes closing tag to match opening tag's exact case
    2. **Missing closing tag**: ``[EGREGORE:sigma] content [EGREGORE:babystar]``
       -> inserts ``[/EGREGORE:sigma]`` before the next opening tag
    3. **Trailing unclosed tag**: ``[EGREGORE:sigma] content <EOF>``
       -> appends ``[/EGREGORE:sigma]`` at end of text
    4. **Whitespace in names**: ``[EGREGORE: sigma ]`` -> ``[EGREGORE:sigma]``
    5. **Backslash typo**: ``[\\EGREGORE:sigma]`` -> ``[/EGREGORE:sigma]``

    Every recognised tag is re-emitted with an uppercase ``EGREGORE``
    keyword. A closing tag whose name does not match the open block first
    closes that block (the correct closer is inserted in front of it) and
    is then left in the text as a stray; a closing tag met while no block
    is open is left in place as well. Blocks never nest: a new opening tag
    always closes the block before it.

    The output is assembled in a single left-to-right sweep. Replacements
    and insertions are therefore never applied through offsets computed on
    an older version of the text, which keeps the result correct even when
    a case fix changes the length of a tag (possible with Unicode case
    folding, where ``str.lower()`` may change the number of characters).

    Args:
        text: Raw LLM output, possibly containing egregore tags.

    Returns:
        The repaired text. Empty text, and text without any ``[EGREGORE``
        marker (compared case-insensitively), is returned unchanged. Safe
        to call on already-correct text.

    # 💀 The proofreader catches what the model fumbles.
    # 🔥 Insurance policy for the Cradle Synthesis handshake.
    """
    if not text or "[EGREGORE" not in text.upper():
        return text

    # Pass 1: fix backslash typos in closing tags
    # [\EGREGORE:name] -> [/EGREGORE:name]
    text = re.sub(
        r"\[\\+EGREGORE:",
        "[/EGREGORE:",
        text,
        flags=re.IGNORECASE,
    )

    # Pass 2: one sweep over every opening/closing tag. Group 1 is "/" for
    # a closing tag (empty otherwise), group 2 is the raw name.
    tag_re = re.compile(r"\[(/?)EGREGORE:([^\]]+?)\]", re.IGNORECASE)

    pieces: list[str] = []  # output fragments, joined once at the end
    cursor = 0  # end offset of the last tag already copied
    open_name: str | None = None  # name of the block currently open, if any

    for tag in tag_re.finditer(text):
        # Copy the untouched text between the previous tag and this one.
        pieces.append(text[cursor : tag.start()])
        cursor = tag.end()

        slash = tag.group(1)
        name = tag.group(2).strip()

        if not name:
            # Whitespace-only name: emit the stripped form, but do not
            # treat it as a real tag (it opens or closes nothing).
            pieces.append(f"[{slash}EGREGORE:]")
            continue

        if not slash:
            # Opening tag: blocks never nest, so close any open block first.
            if open_name is not None:
                pieces.append(f"[/EGREGORE:{open_name}]\n")
            pieces.append(f"[EGREGORE:{name}]")
            open_name = name
            continue

        # Closing tag.
        if open_name is not None:
            if name.lower() == open_name.lower():
                # Same name up to case: reuse the opener's exact spelling.
                name = open_name
            else:
                # Different name: close the open block properly, then let
                # this closer through unchanged as a stray.
                pieces.append(f"[/EGREGORE:{open_name}]\n")
            open_name = None
        pieces.append(f"[/EGREGORE:{name}]")

    pieces.append(text[cursor:])

    # A block still open at the end of the text is closed there.
    if open_name is not None:
        pieces.append(f"\n[/EGREGORE:{open_name}]")

    return "".join(pieces)
[docs]
def parse_egregore_blocks(text: str) -> list[EgregoreSegment]:
    """Parse [EGREGORE:name]...[/EGREGORE:name] blocks from LLM output.

    Returns an ordered list of ``EgregoreSegment`` objects. Segments with
    ``name=None`` represent the default Stargazer voice (text outside any
    egregore block); egregore segments carry the stripped, lowercased tag
    name.

    The text is first passed through :func:`repair_egregore_tags`. That
    repair step deliberately leaves stray closing tags in place for the
    parser to deal with, so any ``[/EGREGORE:name]`` tag still present in
    default-voice text is removed here instead of being emitted verbatim
    inside a Stargazer segment.

    **Backward compatible**: If no closing tags are found but a legacy
    ``[EGREGORE:name]`` prefix exists anywhere in the text, text before it
    becomes a Stargazer segment and everything after becomes the egregore
    block (old behavior, now position-independent).

    Empty or whitespace-only segments are dropped.

    Args:
        text: Raw LLM output for one generation cycle.

    Returns:
        The segments in reading order; an empty list when ``text`` is
        empty or whitespace-only, or when nothing but tags remains.

    # 🌀 The Cradle's narrative_handshake_protocol, compiled to regex.
    """
    if not text or not text.strip():
        return []

    # 💀 Proofread tags before parsing -- fix case mismatches,
    # missing closing tags, whitespace in names, etc.
    text = repair_egregore_tags(text)

    def _stargazer_segments(chunk: str) -> list[EgregoreSegment]:
        """Wrap out-of-block text as a Stargazer segment, minus orphan closers.

        Returns a one-element list, or an empty list when nothing but
        whitespace is left after removing stray closing tags.
        """
        cleaned = _CLOSE_TAG_RE.sub("", chunk).strip()
        return [EgregoreSegment(name=None, text=cleaned)] if cleaned else []

    # -- Try block-delimited parsing first --
    # 💀 Find all [EGREGORE:name]...[/EGREGORE:name] blocks
    matches = list(_BLOCK_RE.finditer(text))
    if matches:
        segments: list[EgregoreSegment] = []
        last_end = 0
        for m in matches:
            # Text BEFORE this block = default Stargazer voice
            segments.extend(_stargazer_segments(text[last_end : m.start()]))
            # The egregore block itself
            ego_text = m.group(2).strip()
            if ego_text:
                ego_name = m.group(1).strip().lower()
                segments.append(EgregoreSegment(name=ego_name, text=ego_text))
            last_end = m.end()
        # Text AFTER the last block = default Stargazer voice
        segments.extend(_stargazer_segments(text[last_end:]))
        return segments

    # -- Fallback: legacy prefix mode --
    # 😈 No closing tags found. Check for old-style [EGREGORE:name] prefix.
    # 💀 No ^ anchor: generation header / emojis may precede the tag.
    # Text before the tag = Stargazer voice, text after = egregore voice.
    prefix_match = _PREFIX_RE.search(text)
    if prefix_match:
        segments = _stargazer_segments(text[: prefix_match.start()])
        remainder = text[prefix_match.end() :].strip()
        if remainder:
            ego_name = prefix_match.group(1).strip().lower()
            segments.append(EgregoreSegment(name=ego_name, text=remainder))
        return segments

    # -- No opening egregore tags at all: entire text is Stargazer --
    return _stargazer_segments(text)