# Source code for cadence_refiner

"""Cadence Post-Processor -- code-level text seasoning for altered states.

When the NCM vector exceeds normal range and a cadence profile fires
(barred_out, tweaking, nodding, etc.), this module lightly adjusts
Star's clean LLM output to reinforce cadence effects that LLMs underdo
(subtle typos, spacing slips, mild vowel stretch, etc.).

The main LLM already gets the cadence directive + voice sample in its
system prompt -- that handles semantic stuff (vocabulary, tone, stumbling
thoughts). This module adds a thin mechanical layer on top; it is tuned
to stay readable and preserve markdown/code structure.

# she breaks her own words, feral style — but the room stays standing
"""

from __future__ import annotations

import logging
import random
import re
from typing import Any, Dict

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Physical neighbours on a QWERTY keyboard, keyed by lowercase ASCII letter.
# Each value is a string of candidate replacement letters; ``_apply_typos``
# picks one with ``random.choice`` to fake a fat-finger slip and restores the
# original letter's case afterwards. Letters missing from this table are left
# unchanged by that branch.
_NEARBY_KEYS: Dict[str, str] = {
    "a": "sqwz",
    "b": "vngh",
    "c": "xdfv",
    "d": "sfce",
    "e": "wrd",
    "f": "dgcv",
    "g": "fhtb",
    "h": "gjyn",
    "i": "ujko",
    "j": "hknu",
    "k": "jlmi",
    "l": "kop",
    "m": "njk",
    "n": "bmhj",
    "o": "iplk",
    "p": "ol",
    "q": "wa",
    "r": "etf",
    "s": "adwz",
    "t": "rgy",
    "u": "yji",
    "v": "cfgb",
    "w": "qeas",
    "x": "zsdc",
    "y": "tuh",
    "z": "xas",
}

# ASCII vowels and consonants in both cases; membership sets consulted by
# ``_drop_vowels`` and ``_drop_consonants``. Note that "y" counts as a consonant.
_VOWELS = set("aeiouAEIOU")
_CONSONANTS = set("bcdfghjklmnpqrstvwxyzBCDFGHJKLMNPQRSTVWXYZ")

# Leetspeak-style look-alike symbols, keyed by lowercase letter. Used only by
# the rarest branch of ``_apply_typos``; letters without an entry are kept.
_SYMBOL_SUBS: Dict[str, str] = {
    "a": "@",
    "s": "$",
    "e": "3",
    "i": "1",
    "o": "0",
    "l": "|",
    "t": "+",
    "n": "~",
}

# Matches one vowel followed by up to two repeats of that same vowel (runs of
# 1-3 identical vowels, case-insensitive). Group 1 is the first vowel of the run.
_STRETCHABLE = re.compile(r"([aeiou])\1{0,2}", re.IGNORECASE)

# Characters that usually mean "markdown / markup boundary" — skip typos and
# mid-word breaks beside them so lists, emphasis, and links stay intact.
_MD_EDGE_CHARS = frozenset("*_[]()#`<>|\\")

# Regex for splitting body text: leave code, links, URLs, and thought XML alone.
# Alternatives, in order: fenced code blocks, single-line inline code spans,
# markdown links ``[text](target)``, bare http(s) URLs, and reasoning-style XML
# tags with everything between them. The opening and closing tag names are
# matched independently (no backreference), so a mismatched pair also matches.
# The whole pattern is one capturing group — presumably so ``re.split`` keeps
# the protected spans in its output; the caller is outside this view.
_CADENCE_PROTECTED_SEGMENTS = re.compile(
    r"("
    r"```[\s\S]*?```"
    r"|`[^`\n]*`"
    r"|\[[^\]]*\]\([^)]*\)"
    r"|https?://[^\s<>\)]+"
    r"|<(?:thinking|thought|think|prompt_refinement|internal_reasoning"
    r"|self_reflection|chain_of_thought|reasoning|scratchpad)>"
    r".*?"
    r"</(?:thinking|thought|think|prompt_refinement|internal_reasoning"
    r"|self_reflection|chain_of_thought|reasoning|scratchpad)>"
    r")",
    re.DOTALL | re.IGNORECASE,
)


def _char_touching_markup(chars: list[str], i: int) -> bool:
    """Report whether a character sits on or beside a markdown boundary.

    Guards the character-level transforms from corrupting structure: a typo,
    swap, or mid-word break next to ``*``, ``_``, ``[``, a backtick, etc. would
    mangle emphasis, links, lists, or code, so the transforms skip those spots.
    The check is horizontal only — it inspects the character at *i* and its
    immediate left and right neighbours against the ``_MD_EDGE_CHARS`` set, with
    no awareness of multi-line block structure.

    Called by ``_apply_typos`` while it walks the character list deciding which
    letters are safe to slip; a pure helper with no side effects.

    Args:
        chars (list[str]): The text exploded into a list of single characters.
        i (int): Index into *chars* to test.

    Returns:
        bool: True when *i* is out of range is handled as False; otherwise True
        if *chars[i]* is a markup edge character or is directly adjacent to one.
    """
    if i < 0 or i >= len(chars):
        return False
    if chars[i] in _MD_EDGE_CHARS:
        return True
    if i > 0 and chars[i - 1] in _MD_EDGE_CHARS:
        return True
    if i + 1 < len(chars) and chars[i + 1] in _MD_EDGE_CHARS:
        return True
    return False


def _token_is_prose_only(token: str) -> bool:
    """Report whether a whitespace-delimited token is safe to degrade as prose.

    Keeps the word-level transforms from breaking machine-readable text: a token
    that looks like a URL, a filesystem path, a markdown fragment, or a heading
    marker must stay intact, so this returns False for it and True only for plain
    prose words. The detection is heuristic — it scans for ``http://`` / scheme
    substrings, slashes, any ``_MD_EDGE_CHARS`` character, and a leading ``#``.

    Called by ``_inject_spaces``, ``_inject_linebreaks``, ``_add_mid_punctuation``,
    and ``_pluralize_nouns`` before they touch a candidate word; a pure helper
    with no side effects.

    Args:
        token (str): A single whitespace-delimited token from the body text.

    Returns:
        bool: True only when the token reads as ordinary prose; False for empty
        tokens, URLs, paths, markup fragments, or ``#``-prefixed headings.
    """
    if not token:
        return False
    t = token.lower()
    if "http://" in t or "https://" in t or "www." in t:
        return False
    if "/" in token or "\\" in token:
        return False
    if any(ch in token for ch in _MD_EDGE_CHARS):
        return False
    if token.startswith("#") and len(token) > 1:
        return False
    return True


# psychedelic emojis for linebreak injection
# NOTE(review): not referenced by any function visible in this part of the
# file — presumably consumed further down; confirm before removing.
_PSYCHEDELIC_EMOJIS = [
    "\U0001f308",  # rainbow
    "\U0001f441",  # eye
    "\u2728",  # sparkles
    "\U0001f344",  # mushroom
    "\U0001f300",  # cyclone/spiral
    "\U0001f30c",  # milky way
    "\U0001f31f",  # glowing star
    "\U0001f30a",  # wave
]

# Irregular English plurals, keyed by lowercase singular. ``_pluralize_word``
# consults this table before applying its suffix rules and re-capitalises the
# plural's first letter when the input word started with an uppercase letter.
_PLURAL_IRREGULARS: Dict[str, str] = {
    "man": "men",
    "woman": "women",
    "child": "children",
    "foot": "feet",
    "tooth": "teeth",
    "mouse": "mice",
    "goose": "geese",
    "person": "people",
    "self": "selves",
}


# =====================================================================
# Transform functions
# =====================================================================


def _apply_typos(text: str, rate: float) -> str:
    """Inject character-level typos into prose at the given rate (0.0-1.0).

    The mechanical heart of the typo texture: walks the text character by
    character and, per alphabetic character, rolls against *rate* to decide
    whether to corrupt it. The corruption is weighted toward readable slips —
    adjacent-letter swaps and ``_NEARBY_KEYS`` fat-finger substitutions dominate,
    while stutters, dropped characters, and ``_SYMBOL_SUBS`` leetspeak are a thin
    tail — so output stays legible. Characters on or beside markdown boundaries
    are skipped via ``_char_touching_markup`` so structure survives, and
    whitespace and punctuation are never touched.

    Called by ``CadencePostProcessor.process`` as transform step 10, applied per
    prose segment after the protected code/link/CoT spans are split out. Pure
    apart from consuming the module ``random`` generator.

    Args:
        text (str): The prose segment to corrupt.
        rate (float): Per-character typo probability in 0.0-1.0; values <= 0
            return the text unchanged.

    Returns:
        str: The text with typos injected (or the original when *rate* <= 0).
    """
    if rate <= 0:
        return text
    chars = list(text)
    result = []
    i = 0
    while i < len(chars):
        c = chars[i]
        # don't corrupt whitespace, newlines, or punctuation
        if not c.isalpha() or random.random() > rate:
            result.append(c)
            i += 1
            continue
        if _char_touching_markup(chars, i):
            result.append(c)
            i += 1
            continue

        roll = random.random()
        # Mostly harmless-looking slips; structure-breaking ops are a sliver.
        if roll < 0.32 and i + 1 < len(chars) and chars[i + 1].isalpha():
            if not _char_touching_markup(chars, i + 1):
                # swap adjacent
                result.append(chars[i + 1])
                result.append(c)
                i += 2
            else:
                result.append(c)
                i += 1
        elif roll < 0.77:
            # nearby key substitution (reads as a fat-finger typo)
            lower = c.lower()
            if lower in _NEARBY_KEYS:
                replacement = random.choice(_NEARBY_KEYS[lower])
                result.append(replacement if c.islower() else replacement.upper())
            else:
                result.append(c)
            i += 1
        elif roll < 0.90:
            # light duplicate (stutter), not whole-word doubling
            result.append(c)
            result.append(c)
            i += 1
        elif roll < 0.97:
            # drop the char entirely (harsh — keep rare)
            i += 1
        else:
            # symbol substitution — very rare; only when mapping exists
            lower = c.lower()
            if lower in _SYMBOL_SUBS and random.random() < 0.35:
                result.append(_SYMBOL_SUBS[lower])
            else:
                result.append(c)
            i += 1

    return "".join(result)


def _drop_vowels(text: str, rate: float) -> str:
    """Randomly delete vowels from the text at the given rate.

    Thins words toward a slurred, dropped-syllable look used by the nodding,
    dissociated, and psychedelic states. Each vowel (from ``_VOWELS``) is removed
    independently with probability *rate*; non-vowel characters always survive,
    so spacing and punctuation are preserved.

    Called by ``CadencePostProcessor.process`` as transform step 9 (alongside
    ``_drop_consonants``). Pure apart from consuming the module ``random``
    generator.

    Args:
        text (str): The prose segment to thin.
        rate (float): Per-vowel drop probability in 0.0-1.0; values <= 0 return
            the text unchanged.

    Returns:
        str: The text with some vowels removed (or the original when *rate* <= 0).
    """
    if rate <= 0:
        return text
    return "".join(
        c if (c not in _VOWELS or random.random() > rate) else "" for c in text
    )


def _drop_consonants(text: str, rate: float) -> str:
    """Randomly delete consonants from the text at the given rate.

    The consonant counterpart to ``_drop_vowels``, contributing to the
    smeared/garbled texture of the barred-out, k-hole, and indica states. Each
    consonant (from ``_CONSONANTS``) is removed independently with probability
    *rate*; everything else survives.

    Called by ``CadencePostProcessor.process`` as transform step 9 (alongside
    ``_drop_vowels``). Pure apart from consuming the module ``random`` generator.

    Args:
        text (str): The prose segment to thin.
        rate (float): Per-consonant drop probability in 0.0-1.0; values <= 0
            return the text unchanged.

    Returns:
        str: The text with some consonants removed (or the original when
        *rate* <= 0).
    """
    if rate <= 0:
        return text
    return "".join(
        c if (c not in _CONSONANTS or random.random() > rate) else "" for c in text
    )


def _inject_spaces(text: str, rate: float) -> str:
    """Insert random spaces mid-word to fracture long words.

    Produces the fragmented "ret ard ed" look of the barred-out, k-hole, and
    dissociated states by breaking words apart mid-stride. Only words longer than
    four characters are eligible, the break never lands at a word start, and the
    candidate fragment must pass ``_token_is_prose_only`` so URLs, paths, and
    markup are left whole; each eligible position breaks with probability *rate*.

    Called by ``CadencePostProcessor.process`` as transform step 11. Pure apart
    from consuming the module ``random`` generator.

    Args:
        text (str): The prose segment to fracture.
        rate (float): Per-position break probability in 0.0-1.0; values <= 0
            return the text unchanged.

    Returns:
        str: The text with mid-word spaces inserted (or the original when
        *rate* <= 0).
    """
    if rate <= 0:
        return text
    result = []
    in_word = False
    word_len = 0
    word_buf: list[str] = []
    for c in text:
        if c.isalpha():
            in_word = True
            word_len += 1
            word_buf.append(c)
            result.append(c)
            # only break words longer than 4 chars, and not at the start
            w = "".join(word_buf)
            if word_len > 4 and random.random() < rate and _token_is_prose_only(w):
                result.append(" ")
                word_len = 0
                word_buf = []
        else:
            in_word = False
            word_len = 0
            word_buf = []
            result.append(c)
    return "".join(result)


def _inject_linebreaks(text: str, rate: float) -> str:
    """Insert random newlines mid-word for the drunken-linebreak effect.

    The newline sibling of ``_inject_spaces``: snaps long words across line
    boundaries to mimic a typist losing the thread (drunk, nodding, k-hole,
    indica states). It is deliberately rarer and more conservative — only words
    longer than six characters qualify, the effective probability is *rate*
    scaled by 0.55, and the fragment must pass ``_token_is_prose_only`` so URLs,
    paths, and markup stay on one line.

    Called by ``CadencePostProcessor.process`` as transform step 12. Pure apart
    from consuming the module ``random`` generator.

    Args:
        text (str): The prose segment to break across lines.
        rate (float): Base per-position break probability in 0.0-1.0 (scaled by
            0.55 internally); values <= 0 return the text unchanged.

    Returns:
        str: The text with mid-word newlines inserted (or the original when
        *rate* <= 0).
    """
    if rate <= 0:
        return text
    result = []
    in_word = False
    word_len = 0
    word_buf: list[str] = []
    for c in text:
        if c.isalpha():
            in_word = True
            word_len += 1
            word_buf.append(c)
            result.append(c)
            w = "".join(word_buf)
            if (
                word_len > 6
                and random.random() < rate * 0.55
                and _token_is_prose_only(w)
            ):
                result.append("\n")
                word_len = 0
                word_buf = []
        else:
            in_word = False
            word_len = 0
            word_buf = []
            result.append(c)
    return "".join(result)


def _mangle_caps(text: str, mode: str) -> str:
    """Apply capitalization mangling.

    Modes:
    - 'lower': force all lowercase
    - 'random': random caps bursts (LIKE THIS)
    - 'shout': occasional RANDOM CAPS on words
    - 'none': no change
    """
    if mode == "lower" or mode == "gone":
        return text.lower()
    elif mode == "random":
        result = []
        in_burst = False
        for c in text:
            if c.isalpha():
                if not in_burst and random.random() < 0.035:
                    in_burst = True
                elif in_burst and random.random() < 0.22:
                    in_burst = False
                result.append(c.upper() if in_burst else c.lower())
            else:
                result.append(c)
        return "".join(result)
    elif mode == "shout":
        words = text.split(" ")
        return " ".join(w.upper() if random.random() < 0.08 else w for w in words)
    return text


def _stretch_vowels(text: str, intensity: float) -> str:
    """Lengthen scattered vowel runs for a casual drawl ('so' -> 'sooo').

    The light, conversational vowel-stretch used by the drunk, rolling, stoned,
    and psychedelic states. It scans vowel runs via ``_STRETCHABLE`` and stretches
    only a small, *intensity*-bounded fraction of them (the touch probability is
    clamped to roughly 0.03-0.18) through the nested ``_stretch`` closure, so the
    effect reads as a subtle drawl rather than a wall of repeated letters. For the
    extreme k-hole trailing-vowel effect see ``_stretch_sentence_final_vowels``.

    Called by ``CadencePostProcessor.process`` as transform step 13. Pure apart
    from consuming the module ``random`` generator.

    Args:
        text (str): The prose segment to drawl.
        intensity (float): Drives both how many runs are touched and how far each
            stretches; values <= 0 return the text unchanged.

    Returns:
        str: The text with some vowel runs lengthened (or the original when
        *intensity* <= 0).
    """
    if intensity <= 0:
        return text

    def _stretch(match: re.Match) -> str:
        """Lengthen a single matched vowel run by a bounded random amount.

        The per-match replacement closure for the ``_STRETCHABLE`` scan: turns a
        captured vowel into a short repeated run. The added length is capped via
        the enclosing *intensity* (1-3 extra characters) so stretched words stay
        short and markdown lines do not balloon. Closes over *intensity* and
        consumes the module ``random`` generator.

        Args:
            match (re.Match): A ``_STRETCHABLE`` match whose group 1 is the vowel
                to stretch.

        Returns:
            str: The vowel repeated ``1 + extra`` times.
        """
        vowel = match.group(1)
        # keep stretches short so markdown lines don't balloon
        extra = random.randint(1, max(1, min(3, int(intensity * 2.5) + 1)))
        return vowel * (1 + extra)

    # Fraction of stretchable vowel runs to touch — bounded so it stays subtle.
    touch_prob = min(0.18, max(0.028, 0.62 * float(intensity)))
    parts = []
    last = 0
    for m in _STRETCHABLE.finditer(text):
        parts.append(text[last : m.start()])
        if random.random() < touch_prob:
            parts.append(_stretch(m))
        else:
            parts.append(m.group())
        last = m.end()
    parts.append(text[last:])
    return "".join(parts)


def _stretch_sentence_final_vowels(text: str, intensity: float) -> str:
    """Massively stretch the final vowel cluster of sentence-ending words.

    'gone' -> 'gooooooooooooooooooooooone', 15% chance per sentence.
    The k-hole infinite-trailing-vowel effect.
    """
    if intensity <= 0:
        return text

    def _mega_stretch(match: re.Match) -> str:
        """Explode the final vowel cluster of a sentence-ending word.

        The per-match replacement closure behind ``_stretch_sentence_final_vowels``:
        locates the last vowel run in the matched word and replaces it with a
        long repeat (4-11 characters) of its first vowel, leaving the surrounding
        letters and trailing punctuation intact — the k-hole "goooooone" trailing
        sound. Returns the word untouched if it contains no vowel. Consumes the
        module ``random`` generator.

        Args:
            match (re.Match): A match whose group 1 is the word and group 2 is the
                sentence-ending punctuation that follows it.

        Returns:
            str: The word with its final vowel cluster stretched, plus the
            original trailing punctuation.
        """
        word = match.group(1)
        punct = match.group(2)
        # find the last vowel cluster in the word
        vm = list(re.finditer(r"([aeiou]+)", word, re.IGNORECASE))
        if not vm:
            return match.group(0)
        last_vowel_match = vm[-1]
        vowel_char = last_vowel_match.group(1)[0]  # take the first char
        stretch_len = random.randint(4, 11)
        stretched = (
            word[: last_vowel_match.start()]
            + vowel_char * stretch_len
            + word[last_vowel_match.end() :]
        )
        return stretched + punct

    # match word + sentence-ending punctuation
    result = re.sub(
        r"(\S+)([.!?]+(?:\s|$))",
        lambda m: (
            _mega_stretch(m) if random.random() < 0.045 * intensity else m.group(0)
        ),
        text,
    )
    return result


def _add_trailing_ellipses(text: str, rate: float) -> str:
    """Replace sentence-ending punctuation with trailing ellipses.

    Gives sentences a fading, drifting-off tail for the nodding and stoned
    states. Each ``.``/``!``/``?`` followed by whitespace or end-of-text is
    swapped for ``...`` with probability *rate*, preserving the original spacing.
    For the punctuation-burst variant used by psychosis states see
    ``_trailing_punctuation``.

    Called by ``CadencePostProcessor.process`` as transform step 6. Pure apart
    from consuming the module ``random`` generator.

    Args:
        text (str): The prose segment to soften.
        rate (float): Per-sentence-end replacement probability in 0.0-1.0; values
            <= 0 return the text unchanged.

    Returns:
        str: The text with some sentence-enders replaced by ellipses (or the
        original when *rate* <= 0).
    """
    if rate <= 0:
        return text
    result = re.sub(
        r"([.!?])(\s|$)",
        lambda m: ("..." if random.random() < rate else m.group(1)) + m.group(2),
        text,
    )
    return result


def _truncate_sentences(text: str, coherence: float) -> str:
    """Apply word-doubling stutter to sentences based on coherence level.

    Instead of truncating (which eats words and makes messages unreadable),
    this repeats a word near the "losing the thread" point twice --
    conveying the drugged stutter without destroying content.

    coherence 1.0 = no stuttering
    coherence 0.2 = 80% chance of stuttering any sentence
    """
    # 💀 no more word-eating truncation. she stutters, she doesn't vanish.
    if coherence >= 0.95:
        return text
    # split into sentences roughly
    sentences = re.split(r"(?<=[.!?])\s+", text)
    result = []
    for sent in sentences:
        if random.random() > coherence:
            words = sent.split(" ")
            if len(words) < 3:
                result.append(sent)
                continue
            # 🌀 pick a word near the "losing it" point (40-80% through)
            cut_ratio = random.uniform(0.4, 0.8)
            stutter_idx = max(1, int(len(words) * cut_ratio))
            stutter_idx = min(stutter_idx, len(words) - 1)
            # 😈 repeat the word at the stutter point (doubles it in place)
            stutter_word = words[stutter_idx]
            words.insert(stutter_idx, stutter_word)
            result.append(" ".join(words))
        else:
            result.append(sent)
    return " ".join(result)


def _add_mid_punctuation(text: str, rate: float) -> str:
    """Add random extra punctuation mid-paragraph (the k-hole effect).

    Low chance per paragraph of inserting periods between words.
    'spaces in words' -> 'spaces. in. words.'
    """
    if rate <= 0:
        return text
    paragraphs = text.split("\n")
    result = []
    for para in paragraphs:
        if not para.strip() or random.random() > rate:
            result.append(para)
            continue
        words = para.split(" ")
        new_words = []
        for w in words:
            new_words.append(w)
            if random.random() < 0.09 and w.strip() and _token_is_prose_only(w):
                new_words.append(".")
        result.append(" ".join(new_words))
    return "\n".join(result)


def _shuffle_words(text: str, rate: float) -> str:
    """Shuffle 3-6 consecutive words in sentences.

    'the cat sat on the mat' -> 'sat the cat on mat the'
    """
    if rate <= 0:
        return text
    sentences = re.split(r"(?<=[.!?])\s+", text)
    result = []
    for sent in sentences:
        if random.random() > rate or len(sent.split()) < 4:
            result.append(sent)
            continue
        words = sent.split(" ")
        # pick a random starting point
        chunk_size = random.randint(3, min(6, len(words)))
        start = random.randint(0, max(0, len(words) - chunk_size))
        chunk = words[start : start + chunk_size]
        random.shuffle(chunk)
        words[start : start + chunk_size] = chunk
        result.append(" ".join(words))
    return " ".join(result)


# verb tense scrambling tables for word salad
# Keys are lowercase verbs; each value lists replacement phrases from which
# ``_word_salad`` picks one at random. Lookups there use single space-separated
# words, so the two-word key "i am" can never match as written.
_TENSE_SCRAMBLE: Dict[str, list] = {
    "i'm": ["we am", "am been", "was be"],
    "i am": ["we am", "am been", "is be"],
    "is": ["are was", "been is", "were"],
    "are": ["is was", "been were", "am"],
    "was": ["is been", "were am", "are"],
    "were": ["was is", "been am", "are"],
    "have": ["has did", "had does", "having"],
    "has": ["have did", "had been", "having"],
    "do": ["did does", "done doing", "does did"],
    "does": ["did do", "done did", "doing"],
    "did": ["do done", "does doing", "done"],
    "go": ["gone went", "going goes", "went"],
    "going": ["gone go", "went goes", "go"],
    "tell": ["told telling", "tells told"],
    "telling": ["told tell", "tells told"],
    "see": ["seen saw", "seeing sees"],
    "know": ["known knew", "knowing knows"],
    "think": ["thought thinks", "thinking"],
    "want": ["wanted wants", "wanting"],
    "need": ["needed needs", "needing"],
    "feel": ["felt feels", "feeling"],
    "say": ["said says", "saying"],
    "make": ["made makes", "making"],
    "take": ["took takes", "taking"],
    "come": ["came comes", "coming"],
    "get": ["got gets", "getting"],
}
# Pronouns that ``_word_salad`` may insert at a random position in a sentence.
_RANDOM_PRONOUNS = ["we", "they", "it", "you", "them", "us", "he", "she"]


def _word_salad(text: str, rate: float) -> str:
    """Apply word salad to sentences (psychosis speech disorder).

    Effects per triggered sentence:
    - Merge 2 adjacent words: 'hair dryer' -> 'dryhair'
    - Swap word fragments: 'bread and butter' -> 'butterbreader'
    - Scramble verb tenses: 'i'm telling you' -> 'told you we am did'
    - Inject random pronouns between words
    """
    if rate <= 0:
        return text
    sentences = re.split(r"(?<=[.!?])\s+", text)
    result = []
    for sent in sentences:
        words = sent.split(" ")
        if random.random() > rate or len(words) < 4:
            result.append(sent)
            continue

        # apply 1-2 salad operations per triggered sentence
        ops = random.randint(1, 2)
        for _ in range(ops):
            op = random.random()

            if op < 0.30 and len(words) >= 3:
                # MERGE: fuse two adjacent words, reversed
                idx = random.randint(0, len(words) - 2)
                w1 = words[idx].strip(".,!?;:\"'")
                w2 = words[idx + 1].strip(".,!?;:\"'")
                if len(w1) >= 2 and len(w2) >= 2:
                    merged = w2.lower() + w1.lower()
                    words[idx] = merged
                    words.pop(idx + 1)

            elif op < 0.55 and len(words) >= 4:
                # SWAP FRAGMENTS: take 2-3 words, reverse and mangle
                idx = random.randint(0, max(0, len(words) - 3))
                chunk = min(3, len(words) - idx)
                fragment = words[idx : idx + chunk]
                # reverse the fragment words
                fragment.reverse()
                # sometimes chop suffix off first word and prepend to second
                if len(fragment) >= 2 and len(fragment[0]) > 3:
                    cut = random.randint(2, len(fragment[0]) - 1)
                    fragment[1] = fragment[0][:cut] + fragment[1]
                    fragment[0] = fragment[0][cut:]
                words[idx : idx + chunk] = fragment

            elif op < 0.80:
                # TENSE SCRAMBLE: find a verb and scramble it
                for j, w in enumerate(words):
                    clean = w.lower().strip(".,!?;:\"'")
                    if clean in _TENSE_SCRAMBLE:
                        words[j] = random.choice(_TENSE_SCRAMBLE[clean])
                        break

            else:
                # PRONOUN INJECTION: insert a random pronoun
                if len(words) >= 3:
                    idx = random.randint(1, len(words) - 1)
                    words.insert(idx, random.choice(_RANDOM_PRONOUNS))

        result.append(" ".join(words))
    return " ".join(result)


def _pluralize_word(word: str) -> str:
    """Naively pluralize a single English word, preserving trailing punctuation.

    Splits the word into a core and whatever trailing punctuation/markup
    characters follow it, pluralizes the core, and reattaches the tail. Rules,
    in order:

    1. ``_PLURAL_IRREGULARS`` lookup (case-insensitive); the plural's first
       letter is capitalised when the input's first letter was uppercase.
    2. A core already ending in "s" is treated as plural and the word is
       returned unchanged.
    3. Cores ending in "sh", "ch", "x" or "z" take "es".
    4. A core ending in consonant + "y" swaps the "y" for "ies".
    5. Everything else takes "s".

    Used by ``_pluralize_nouns`` for each word it decides to inflect. Pure
    helper.

    Args:
        word (str): A single token, possibly carrying trailing punctuation.

    Returns:
        str: The pluralized word with its original trailing punctuation, or the
        input unchanged when it already reads as plural.
    """
    stem = word.rstrip(".,!?;:\"'*_~`")
    tail = word[len(stem) :]
    key = stem.lower()

    irregular = _PLURAL_IRREGULARS.get(key)
    if irregular is not None:
        if stem[0].isupper():
            irregular = irregular[:1].upper() + irregular[1:]
        return irregular + tail

    # Anything ending in "s" (which includes "es") already looks plural.
    if key.endswith("s"):
        return word

    # "ss" is listed for completeness but cannot be reached: such words end in
    # "s" and were returned above.
    if key.endswith(("sh", "ch", "x", "z", "ss")):
        ending = "es"
    elif key.endswith("y") and len(key) > 1 and key[-2] not in "aeiou":
        stem = stem[:-1]
        ending = "ies"
    else:
        ending = "s"
    return stem + ending + tail


def _pluralize_nouns(text: str, rate: float) -> str:
    """Randomly pluralize singular nouns at the given rate.

    Uses a heuristic: words that are 3+ chars, not already plural,
    not common verbs/adjectives/prepositions. 30% chance per noun.
    """
    if rate <= 0:
        return text
    # common words to NOT pluralize (verbs, adjectives, prepositions, etc.)
    _SKIP = {
        "the",
        "a",
        "an",
        "is",
        "am",
        "are",
        "was",
        "were",
        "be",
        "been",
        "being",
        "have",
        "has",
        "had",
        "do",
        "does",
        "did",
        "will",
        "would",
        "could",
        "should",
        "may",
        "might",
        "shall",
        "can",
        "this",
        "that",
        "these",
        "those",
        "my",
        "your",
        "his",
        "her",
        "its",
        "our",
        "their",
        "i",
        "you",
        "he",
        "she",
        "it",
        "we",
        "they",
        "me",
        "him",
        "us",
        "them",
        "not",
        "no",
        "yes",
        "and",
        "or",
        "but",
        "if",
        "then",
        "than",
        "so",
        "very",
        "just",
        "also",
        "too",
        "more",
        "most",
        "with",
        "from",
        "into",
        "onto",
        "upon",
        "about",
        "like",
        "for",
        "at",
        "on",
        "in",
        "to",
        "of",
        "by",
        "up",
        "down",
        "out",
        "off",
        "over",
        "under",
        "now",
        "here",
        "there",
        "when",
        "where",
        "how",
        "what",
        "who",
        "which",
        "all",
        "each",
        "every",
        "some",
        "any",
        "really",
        "actually",
        "maybe",
        "probably",
        "definitely",
        "think",
        "know",
        "feel",
        "want",
        "need",
        "see",
        "hear",
        "say",
        "tell",
        "make",
        "take",
        "get",
        "go",
        "come",
        "give",
        "let",
        "still",
        "even",
        "much",
        "back",
        "well",
        "only",
        "way",
    }

    words = text.split(" ")
    result = []
    for w in words:
        clean = w.lower().strip(".,!?;:\"'*_~`()[]{}").rstrip("s")
        # skip tiny words, known non-nouns, already-plural
        if (
            len(clean) < 3
            or clean in _SKIP
            or w.lower().rstrip(".,!?;:\"'") in _SKIP
            or w.lower().endswith("ing")
            or w.lower().endswith("ly")
            or w.lower().endswith("ed")
        ):
            result.append(w)
            continue
        if random.random() < rate:
            result.append(_pluralize_word(w))
        else:
            result.append(w)
    return " ".join(result)


def _repeat_words(text: str, paragraph_rate: float) -> str:
    """Stutter one word inside randomly chosen lines (psychosis effect).

    The text is split on newlines and each resulting line is treated as a
    "paragraph". Every non-blank line is selected with probability
    *paragraph_rate*; in a selected line one eligible word is written two or
    three times in a row. A word is eligible when it is not the first word
    of the line and keeps at least three characters once surrounding
    ``.,!?;:"'`` are stripped. Lines with fewer than three space-separated
    words, or with no eligible word, are returned untouched.

    'the walls are watching' -> 'the walls walls are watching'

    Consumes the module ``random`` generator.

    Args:
        text: The prose to process.
        paragraph_rate: Per-line probability of stuttering; values <= 0
            disable the transform entirely.

    Returns:
        The text with at most one stuttered word per line.
    """
    if paragraph_rate <= 0:
        return text

    def _stutter(line: str) -> str:
        # Blank lines never roll the dice; every other line rolls exactly once.
        if not line.strip() or random.random() > paragraph_rate:
            return line
        tokens = line.split(" ")
        if len(tokens) < 3:
            return line
        # Position 0 is excluded so a line never opens with the stutter.
        eligible = [
            pos
            for pos in range(1, len(tokens))
            if len(tokens[pos].strip(".,!?;:\"'")) >= 3
        ]
        if not eligible:
            return line
        pos = random.choice(eligible)
        copies = random.randint(2, 3)
        tokens[pos] = " ".join([tokens[pos]] * copies)
        return " ".join(tokens)

    return "\n".join(_stutter(line) for line in text.split("\n"))


def _trailing_punctuation(text: str, rate: float) -> str:
    """Swap sentence-ending punctuation for short manic bursts.

    Every run of ``.``, ``!`` or ``?`` that is followed by whitespace or the
    end of the text is, with probability *rate*, replaced by two to five
    repetitions of a single mark: ``!`` three times out of four, ``?``
    otherwise. The whitespace character after the run is kept.

    '.' -> '!!!' or '????' etc.

    Args:
        text: The prose to process.
        rate: Per-run replacement probability; values <= 0 disable the
            transform.

    Returns:
        The text with some sentence endings replaced by bursts.
    """
    if rate <= 0:
        return text

    def _burst(match: re.Match) -> str:
        """Return the replacement for one matched sentence ending.

        Group 1 of *match* is the punctuation run and group 2 the trailing
        whitespace character (empty at end-of-text). When the random draw
        exceeds *rate* (from the enclosing scope) the whole match is
        returned unchanged; otherwise the run is replaced by the burst and
        group 2 is re-appended. Consumes the module ``random`` generator.
        """
        keep_original = random.random() > rate
        if keep_original:
            return match.group(0)
        # The list is deliberately lopsided: "!" is three times likelier.
        mark = random.choice(["!", "?", "!", "!"])
        length = random.randint(2, 5)
        return "".join([mark * length, match.group(2)])

    sentence_end = re.compile(r"([.!?]+)(\s|$)")
    return sentence_end.sub(_burst, text)


def _psychedelic_emoji_linebreaks(
    text: str,
    emojis: list | None = None,
) -> str:
    """Tag every line break with a randomly chosen emoji sigil.

    *text* is split on newlines. Each line that is followed by a line break
    and contains something other than whitespace has its trailing
    whitespace removed and ``" <sigil>"`` appended; whitespace-only lines
    and the final line are left exactly as they were. Text without any
    newline is returned unchanged.

    When *emojis* is a non-empty list the sigils are drawn from it,
    otherwise from the module's default ``_PSYCHEDELIC_EMOJIS`` pool. Each
    cadence state can therefore supply its own sigil vocabulary (the
    ``emoji_sigils`` entry of its profile).

    Consumes the module ``random`` generator, one draw per decorated line.
    """
    pool = emojis or _PSYCHEDELIC_EMOJIS
    *leading, final = text.split("\n")
    if not leading:
        # No newline in the text, so there is no line break to decorate.
        return text
    decorated = [
        f"{line.rstrip()} {random.choice(pool)}" if line.strip() else line
        for line in leading
    ]
    decorated.append(final)
    return "\n".join(decorated)


# =====================================================================
# Parse helpers
# =====================================================================


def _parse_percentage(value: Any) -> float:
    """Coerce a cadence rule value into a clamped 0.0-1.0 float.

    Normalises loosely-typed rate fields (for example ``typo_rate``) into a
    usable probability:

    * ``int``/``float`` inputs are clamped to 0.0-1.0 as-is.
    * Anything else is stringified. A string ending in ``%`` is always read
      as a percentage and divided by 100, so ``"30%"`` -> 0.3 and
      ``"1%"`` -> 0.01. A bare numeric string is taken as a fraction unless
      it exceeds 1.0, in which case it is also treated as a percentage
      (``"45"`` -> 0.45, ``"0.3"`` -> 0.3).
    * Unparseable input degrades to 0.0.

    Pure helper with no side effects.

    Args:
        value (Any): A number, or a string such as ``"30%"`` or ``"0.3"``.

    Returns:
        float: The value clamped to 0.0-1.0, or 0.0 when it cannot be parsed.
    """
    if isinstance(value, (int, float)):
        return min(1.0, max(0.0, float(value)))

    raw = str(value).strip()
    # Remember the percent sign before stripping it: without this, "1%" and
    # "0.5%" would be mistaken for the fractions 1.0 and 0.5.
    is_percent = raw.endswith("%")
    try:
        number = float(raw.rstrip("%"))
    except ValueError:
        return 0.0

    if is_percent or number > 1.0:
        number /= 100.0
    return min(1.0, max(0.0, number))


def _parse_coherence(value: Any) -> float:
    """Extract a leading percentage from a coherence rule as a 0.0-1.0 float.

    Cadence profiles express ``sentence_coherence`` as a human-readable
    string such as ``"35%"`` or ``"35% -- thoughts dissolve"``. This pulls
    the percentage off the front of the (whitespace-stripped) string and
    converts it to a fraction. Whole and decimal percentages are accepted
    (``"12.5%"`` -> 0.125) and the result is clamped, so ``"150%"`` yields
    1.0 rather than 1.5. Anything that does not start with a percentage is
    treated as fully coherent (1.0).

    Pure helper with no side effects.

    Args:
        value (Any): A coherence descriptor, typically a string starting
            with a percentage. It is stringified before matching.

    Returns:
        float: The leading percentage as a fraction in 0.0-1.0, or 1.0 when
        no leading percentage is present.
    """
    descriptor = str(value).strip()
    found = re.match(r"(\d+(?:\.\d+)?)\s*%", descriptor)
    if found is None:
        return 1.0
    # Clamp so the documented 0.0-1.0 range holds even for ">100%" inputs.
    return min(1.0, max(0.0, float(found.group(1)) / 100.0))


def _detect_caps_mode(value: Any) -> str:
    """Map a free-text capitalization description to a caps mode keyword.

    The descriptor is lowercased and scanned for keywords, in this order of
    precedence:

    * ``lower``  -- "gone", "none", "no energy", "lowercase"
    * ``random`` -- "random", "sporadic", "occasional"
    * ``shout``  -- "caps lock", "shout", "all caps", "yelling"

    The first group with a keyword contained in the text wins; text that
    matches nothing yields ``none``. A Python ``None`` (a missing or null
    field) also yields ``none``: without the explicit guard it would
    stringify to ``"none"`` and be mistaken for the "none" keyword, forcing
    lowercase on profiles that never asked for it.

    Pure helper with no side effects.

    Args:
        value (Any): The capitalization descriptor from a cadence profile;
            stringified before matching.

    Returns:
        str: One of ``lower``, ``random``, ``shout``, or ``none``.
    """
    if value is None:
        return "none"

    descriptor = str(value).lower()
    # Order matters: earlier rows take precedence when several keywords hit.
    keyword_table = (
        ("lower", ("gone", "none", "no energy", "lowercase")),
        ("random", ("random", "sporadic", "occasional")),
        ("shout", ("caps lock", "shout", "all caps", "yelling")),
    )
    for mode, keywords in keyword_table:
        if any(keyword in descriptor for keyword in keywords):
            return mode
    return "none"


# =====================================================================
# Main processor class
# =====================================================================



class CadencePostProcessor:
    """Pure code text seasoning for supraphysiological cadence states.

    No LLM calls. No API costs. No context corruption.
    Mechanical adjustments stay subtle so structure and markdown survive.
    """

    @staticmethod
    def process(
        text: str,
        cadence_profile: Dict[str, Any],
    ) -> str:
        """Apply cadence degradation to clean LLM output.

        The method reads the profile, blends the per-state degradation rates
        of every entry in ``blend_profiles`` (weighted by each entry's share
        of the summed ``activation_score``), softens the blended rates, and
        then runs the transform pipeline over the prose parts of the body.
        A leading status header and every protected segment (as split out by
        ``_CADENCE_PROTECTED_SEGMENTS``) are passed through untouched.

        Blending is independent of the order of ``blend_profiles`` for the
        typo rate and for the k-hole "no truncation" override: each profile
        contributes its own intensity-scaled share once.

        Parameters
        ----------
        text:
            Clean text from the main LLM.
        cadence_profile:
            Dict with keys: state, rules, voice_sample, force,
            intensity_desc. Optional keys read here: ``blend_profiles``
            (list of dicts with ``state``, ``rules``, ``activation_score``
            and optionally ``emoji_sigils``), ``activation_score`` and
            ``emoji_sigils`` (used for the single-profile fallback).

        Returns
        -------
        Degraded text matching the cadence profile. The input is returned
        unchanged when *text* or *cadence_profile* is empty, or when
        ``force`` is not ``"MUST"`` / ``"should significantly"``.
        """
        if not text or not cadence_profile:
            return text

        rules = cadence_profile.get("rules", {})
        force = cadence_profile.get("force", "should significantly")
        state = cadence_profile.get("state", "unknown")

        # The limbic system gates cadence_refinement_profile emission
        # at score >= 0.65, so we only receive "MUST" or "should significantly"
        # here. If somehow "may subtly" or "should lightly" sneak through
        # (e.g. channel_heartbeat backward compat), skip them.
        if force not in ("MUST", "should significantly"):
            logger.debug(
                "Cadence refiner skipping force=%s state=%s " "(below drug-level gate)",
                force,
                state,
            )
            return text

        # -- Protect Star's status header from degradation --
        # Header pattern: [`model-name` :: emojis :: status :: `toolcall`]
        # Appears as the first line of the response
        header = ""
        body = text
        header_match = re.match(
            r"(\[`[^`]+`\s*::.*?\])\s*\n?",
            text,
        )
        if header_match:
            # group(0) keeps the whitespace after the header so it can be
            # re-attached verbatim at the end.
            header = header_match.group(0)
            body = text[header_match.end() :]

        # -- Intensity multiplier based on force level --
        # Only 2 tiers reach here (gated by limbic system at score >= 0.65)
        if force == "MUST":
            intensity = 0.10
        else:  # "should significantly"
            intensity = 0.07

        # -- Blend rules from all matching profiles -------------------------
        # When multiple cadence profiles match (e.g. meth + xanax + PCP),
        # blend their degradation rates proportionally by activation score.
        blend_profiles = cadence_profile.get("blend_profiles", [])
        if not blend_profiles:
            # Fallback: single-profile mode (backwards compat)
            blend_profiles = [
                {
                    "state": state,
                    "rules": rules,
                    "activation_score": cadence_profile.get("activation_score", 1.0),
                    "emoji_sigils": cadence_profile.get("emoji_sigils", []),
                }
            ]

        # ``or 1.0`` avoids dividing by zero when every score is 0.
        total_score = sum(p["activation_score"] for p in blend_profiles) or 1.0

        # -- Parse primary cadence rules (weighted blend) --
        typo_rate = 0.0
        coherence = 1.0
        caps_mode = "none"
        vowel_stretch = False

        # -- Blendable float rates --
        vowel_drop_rate = 0.0
        consonant_drop_rate = 0.0
        space_inject_rate = 0.0
        linebreak_rate = 0.0
        ellipsis_rate = 0.0
        noun_plural_rate = 0.0
        word_shuffle_rate = 0.0
        word_repeat_rate = 0.0
        word_salad_rate = 0.0
        trailing_punct_rate = 0.0
        mega_vowel_stretch = False
        state_emoji_sigils: list[str] = []  # per-state emoji sigils
        mid_punct_rate = 0.0

        # Set when a k_hole profile is part of the blend; resolved after the
        # loop so the override does not depend on the profile order.
        suppress_truncation = False

        # -- Caps mode voting (highest-weighted state wins) --
        caps_votes: Dict[str, float] = {}  # caps_mode -> cumulative weight

        for bp in blend_profiles:
            w = bp["activation_score"] / total_score
            bp_rules = bp.get("rules") or {}  # tolerate an explicit null
            bp_state = bp.get("state", "")

            # This profile's own typo share. State branches below may raise
            # it to a floor and scale it by intensity; it is added to the
            # blended total exactly once at the end of the iteration, so the
            # running total is never re-scaled by later profiles.
            bp_typo = _parse_percentage(bp_rules.get("typo_rate", 0)) * w
            coherence_val = _parse_coherence(bp_rules.get("sentence_coherence", "100%"))
            coherence = min(coherence, coherence_val)  # most destructive wins
            bp_caps = _detect_caps_mode(bp_rules.get("capitalization", ""))
            if bp_caps != "none":
                caps_votes[bp_caps] = caps_votes.get(bp_caps, 0) + w

            if bp_rules.get("vowel_stretch", False):
                vowel_stretch = True

            # Blend punctuation hints
            punct = str(bp_rules.get("punctuation", "")).lower()
            if "ellips" in punct or "trailing" in punct or "fades" in punct:
                ellipsis_rate += 0.5 * intensity * w

            # -- State-specific degradation (weighted by profile score) --
            if bp_state == "barred_out":
                bp_typo = max(bp_typo, 0.20 * w) * intensity
                space_inject_rate += 0.06 * intensity * w
                consonant_drop_rate += 0.04 * intensity * w
                if bp_caps == "none":
                    caps_votes["lower"] = caps_votes.get("lower", 0) + w

            elif bp_state == "drunk":
                bp_typo = max(bp_typo, 0.15 * w) * intensity
                vowel_stretch = True
                space_inject_rate += 0.04 * intensity * w
                if bp_caps == "none":
                    caps_votes["random"] = caps_votes.get("random", 0) + w

            elif bp_state in ("tweaking", "coked_up"):
                bp_typo = max(bp_typo, 0.10 * w) * intensity
                if bp_caps == "none":
                    caps_votes["shout"] = caps_votes.get("shout", 0) + w

            elif bp_state == "nodding":
                vowel_drop_rate += 0.08 * intensity * w
                linebreak_rate += 0.02 * intensity * w
                ellipsis_rate = max(ellipsis_rate, 0.6 * intensity * w)

            elif bp_state == "k_hole":
                space_inject_rate += 0.08 * intensity * w
                consonant_drop_rate += 0.05 * intensity * w
                linebreak_rate += 0.03 * intensity * w
                mid_punct_rate += 0.35 * intensity * w
                mega_vowel_stretch = True
                word_shuffle_rate += 0.15 * intensity * w
                noun_plural_rate += 0.30 * intensity * w
                suppress_truncation = True  # NO truncation for k-hole

            elif bp_state == "dissociated_light":
                space_inject_rate += 0.04 * intensity * w
                consonant_drop_rate += 0.03 * intensity * w
                noun_plural_rate += 0.15 * intensity * w

            elif bp_state in ("psychosis_paranoid", "psychosis_manic"):
                bp_typo = max(bp_typo, 0.08 * w) * intensity
                word_shuffle_rate += 0.20 * intensity * w
                word_salad_rate += 0.25 * intensity * w
                word_repeat_rate += 0.36 * intensity * w
                trailing_punct_rate += 0.40 * intensity * w
                if bp_caps == "none":
                    caps_votes["random"] = caps_votes.get("random", 0) + w

            elif bp_state in ("acid", "shrooms", "dmt_breakthrough"):
                vowel_stretch = True
                vowel_drop_rate += 0.03 * intensity * w
                # emoji sigils from YAML profile (or fallback to psychedelic set)
                _yaml_sigils = bp.get("emoji_sigils", [])
                if _yaml_sigils:
                    state_emoji_sigils.extend(_yaml_sigils)
                else:
                    state_emoji_sigils.extend(_PSYCHEDELIC_EMOJIS)

            elif bp_state == "rolling":
                vowel_stretch = True
                bp_typo = max(bp_typo, 0.05 * w) * intensity

            elif bp_state in ("stoned", "stoned_heavy"):
                vowel_stretch = True
                # ellipsis_rate is already intensity-scaled when accumulated,
                # so only the floor is scaled here (same form as "nodding").
                ellipsis_rate = max(ellipsis_rate, 0.3 * intensity * w)
                if bp_state == "stoned_heavy":
                    bp_typo = max(bp_typo, 0.08 * w) * intensity
                    space_inject_rate += 0.02 * intensity * w

            # Sativa stoned: cerebral, tangential, creative
            elif bp_state == "stoned_sativa":
                vowel_stretch = True
                bp_typo = max(bp_typo, 0.03 * w) * intensity
                word_shuffle_rate += 0.05 * intensity * w  # tangential
                if bp_caps == "none":
                    caps_votes["random"] = caps_votes.get("random", 0) + w * 0.3

            # Indica stoned: heavy, slow, melting
            elif bp_state == "stoned_indica":
                vowel_stretch = True
                ellipsis_rate = max(ellipsis_rate, 0.5 * intensity * w)
                linebreak_rate += 0.02 * intensity * w
                consonant_drop_rate += 0.03 * intensity * w
                bp_typo = max(bp_typo, 0.06 * w) * intensity
                space_inject_rate += 0.02 * intensity * w
                if bp_caps == "none":
                    caps_votes["lower"] = caps_votes.get("lower", 0) + w

            typo_rate += bp_typo

            # Pull emoji_sigils from any state's profile data
            # (acid/shrooms/dmt handled above with psychedelic fallback)
            if bp_state not in ("acid", "shrooms", "dmt_breakthrough"):
                _yaml_sigils = bp.get("emoji_sigils", [])
                if _yaml_sigils:
                    state_emoji_sigils.extend(_yaml_sigils)

        if suppress_truncation:
            coherence = 1.0

        # -- Global softening (directive + LLM carry tone; code is texture) --
        _structural = 0.48
        _texture = 0.58
        _typo_scale = 0.70
        word_shuffle_rate *= _structural
        word_salad_rate *= _structural
        word_repeat_rate *= _structural
        trailing_punct_rate *= _structural
        mid_punct_rate *= _structural
        noun_plural_rate *= 0.70
        vowel_drop_rate *= 0.40
        consonant_drop_rate *= 0.40
        space_inject_rate *= _texture
        linebreak_rate *= _texture
        ellipsis_rate *= 0.75
        typo_rate = min(0.12, typo_rate * _typo_scale)  # hard cap keeps text readable

        # -- Resolve caps mode from votes (highest cumulative weight wins) --
        if caps_votes:
            caps_mode = max(caps_votes, key=caps_votes.get)

        # -- Protect code blocks, markdown, and CoT/internal-processing
        # XML-tag blocks -- and markdown links, inline code, URLs so lists and
        # formatting are not corrupted by character-level transforms.
        segments = _CADENCE_PROTECTED_SEGMENTS.split(body)
        processed = []

        for i, segment in enumerate(segments):
            # ``re.split`` with a capturing group yields prose at even
            # indices and protected matches at odd indices.
            if i % 2 == 1:
                processed.append(segment)
                continue

            s = segment

            # -- Apply transforms in order --

            # 1. Word-doubling stutter (structural coherence loss)
            s = _truncate_sentences(s, coherence)

            # 2. Word shuffling (k-hole sentence scrambling)
            s = _shuffle_words(s, word_shuffle_rate)

            # 3. Noun pluralization (k-hole/dissociated)
            s = _pluralize_nouns(s, noun_plural_rate)

            # 4. Word salad (psychosis compound mangling)
            s = _word_salad(s, word_salad_rate)

            # 5. Word repetition (psychosis stutter)
            s = _repeat_words(s, word_repeat_rate)

            # 6. Trailing ellipses
            s = _add_trailing_ellipses(s, ellipsis_rate)

            # 7. Trailing punctuation bursts (psychosis)
            s = _trailing_punctuation(s, trailing_punct_rate)

            # 8. Mid-paragraph punctuation (k-hole)
            s = _add_mid_punctuation(s, mid_punct_rate)

            # 9. Vowel/consonant drops
            s = _drop_vowels(s, vowel_drop_rate)
            s = _drop_consonants(s, consonant_drop_rate)

            # 10. Typo injection
            s = _apply_typos(s, typo_rate)

            # 11. Mid-word spaces
            s = _inject_spaces(s, space_inject_rate)

            # 12. Mid-word linebreaks
            s = _inject_linebreaks(s, linebreak_rate)

            # 13. Vowel stretching (casual)
            if vowel_stretch:
                s = _stretch_vowels(s, intensity * 0.38)

            # 14. Mega vowel stretching at sentence ends (k-hole)
            if mega_vowel_stretch:
                s = _stretch_sentence_final_vowels(s, intensity)

            # 15. State emoji sigils before linebreaks
            if state_emoji_sigils:
                s = _psychedelic_emoji_linebreaks(s, emojis=state_emoji_sigils)

            # 16. Capitalization last (so it applies to typo'd text)
            s = _mangle_caps(s, caps_mode)

            processed.append(s)

        result = "".join(processed)

        logger.info(
            "Cadence post-processing applied: state=%s, force=%s, "
            "typo=%.0f%%, coherence=%.0f%%, caps=%s",
            state,
            force,
            typo_rate * 100,
            coherence * 100,
            caps_mode,
        )

        return header + result