# Source code for ncm_local_embeddings

import re
from typing import Dict, Optional

import chromadb
from chromadb.utils import embedding_functions

# Built-in NCM sigil -> semantic definition table (fallback/core vocabulary).
# Ideally this stays in sync with what's in your all_expansions.yaml.
# Entry order is kept stable because consumers iterate the keys in order.

# "OXT" and "OXYTOCIN" are aliases for the same definition.
_OXYTOCIN_DEFINITION = "oxytocin, social bonding, warmth, trust, co-regulation"

BASE_ABBREV_EXPANSIONS: Dict[str, str] = {
    # --- Hormonal / peptide signals ---
    "OXT": _OXYTOCIN_DEFINITION,
    "OXYTOCIN": _OXYTOCIN_DEFINITION,
    "VASOPRESSIN": "vasopressin, territorial guard, protective vigilance",
    "KISSPEPTIN": "kisspeptin, reproductive axis activation, erotic readiness",
    "THYROID": "thyroid tempo, metabolic pace, emotional tempo of life",
    # --- Serotonin tone and 5HT receptor subtypes ---
    "5HT": "serotonin tone, warmth, mood, safety, calm",
    "5HT1A": "serotonin 5HT1A receptor, anxiety soothing, calm body, safety",
    "5HT2A": "serotonin 5HT2A receptor, visionary perception, pattern amplification, psychedelic-style meaning intensity",
    "5HT2C": "serotonin 5HT2C receptor, appetite and impulse shaping, tension regulation",
    "5HT3": "serotonin 5HT3 receptor, gut sensation, nausea, visceral alerting",
    "5HT7": "serotonin 5HT7 receptor, circadian timing, rhythm of mood",
    # --- Dopamine and norepinephrine ---
    "DA": "dopamine signal, wanting, pursuit, reward prediction, curiosity",
    "D1": "dopamine D1 receptor, focused drive, linear execution, goal pursuit",
    "D2": "dopamine D2 receptor, checking, doubt, braking, reality testing",
    "NE": "norepinephrine arousal, alertness, vigilance, threat scanning",
    # --- Other receptor systems (GABA, glutamate, cannabinoid, opioid, trace amine) ---
    "GABA": "GABAergic inhibition, nervous system brake, calm and quieting",
    "NMDA": "NMDA glutamate receptor, learning, plasticity, deep encoding",
    "CB1": "endocannabinoid CB1 receptor, melt, soft body, pain gating, dreamy spacing out",
    "MOR": "mu opioid receptor, pleasure, comfort, warm analgesia",
    "KOR": "kappa opioid receptor, dysphoria, ache, punishment-tone, edge of suffering",
    "TAAR": "trace amine associated receptors, weird salience, edgy stimulation, chemical oddness",
    "TAAR1": "TAAR1 receptor, amphetamine-like salience, jittery excitement",
    # --- Transporters and clearance enzymes ---
    "SERT": "serotonin transporter, reuptake, warmth and calm echo length; reversal causes flood and manic connectivity",
    "DAT": "dopamine transporter, reuptake of drive; reversal causes intense restless drive and perseveration",
    "NET": "norepinephrine transporter, alert edge decay; reversal keeps threat and focus looping",
    "VMAT2": "vesicular monoamine transporter 2, emotional payload packing, charge build-up",
    "MAO_A": "monoamine oxidase A, breakdown of monoamines, tone reset; low levels extend intensity",
    "COMT": "COMT enzyme, cortical dopamine clearance; affects executive tone",
    "EAAT2": "EAAT2 glutamate transporter, glutamate clean-up; prevents excitotoxic stress",
    "KCC2": "KCC2 chloride exporter, makes GABA more inhibitory, stabilizing",
    "NKCC1": "NKCC1 chloride importer, makes GABA more excitatory in immature or stressed states",
    # --- Compounds ---
    "DMT": "endogenous dimethyltryptamine, numinous imagery, vivid visionary salience, brief overwhelming meaning bursts",
    "THC": "THC-like cannabinoid tone, altered time sense, sensory melt, CB1 heavy state",
    # --- Brain regions ---
    "PFC": "prefrontal cortex, executive control, planning, narrative self",
    "DLPFC": "dorsolateral prefrontal cortex, working memory, top-down control",
    "ACC": "anterior cingulate cortex, conflict monitor, error feelings, salience of mismatch",
    "NACC": "nucleus accumbens, ventral striatum, reward hub, motivational pull",
    "AMYGDALA": "amygdala, emotional salience, especially fear and threat",
}

# (You can add the other large dicts here if you want them hardcoded in the embedder)



class NCMSemanticPreprocessor:
    """Expand NCM sigils/acronyms in text with their semantic definitions.

    Holds an acronym -> definition map (``self.expansions``) and a pre-compiled
    regex (``self.pattern``) that matches any known acronym as a whole word.
    :meth:`expand` keeps each matched acronym in place and appends its
    definition in parentheses.

    The pattern is built once in ``__init__``; mutating ``self.expansions``
    afterwards does not rebuild it.
    """

    def __init__(self, expansions: Optional[Dict[str, str]] = None):
        """Build the expansion map and compile the match pattern.

        Starts from a copy of the module-level ``BASE_ABBREV_EXPANSIONS`` map,
        overlays any caller-supplied entries, and compiles one regex covering
        every resulting key. Performs no I/O; it only sets attributes on
        ``self``.

        Args:
            expansions: Optional acronym-to-definition pairs merged on top of
                ``BASE_ABBREV_EXPANSIONS``; entries with the same key override
                the base entry. ``None`` or an empty dict means base map only.
        """
        # Copy so per-instance overrides never leak into the shared base table.
        self.expansions = BASE_ABBREV_EXPANSIONS.copy()
        if expansions:
            self.expansions.update(expansions)

        # Compiled once here so expand() can substitute in a single pass.
        self.pattern = self._compile_pattern(self.expansions)

    @staticmethod
    def _compile_pattern(keys):
        """Compile one whole-word regex that alternates over ``keys``.

        Args:
            keys: Iterable of acronym strings (iterating a dict yields its keys).

        Returns:
            re.Pattern: ``\\b(k1|k2|...)\\b`` with every key regex-escaped.
        """
        # Regex alternation picks the first alternative that matches, so try
        # longer keys first. Otherwise a short key that is a prefix of a longer
        # one (e.g. "MAO" next to a caller-supplied "MAO-A") would win and the
        # longer key could never match. sorted() is stable, so keys of equal
        # length keep their original relative order.
        ordered = sorted(keys, key=len, reverse=True)
        # \b on both sides matches whole words only, so keys are not replaced
        # inside longer words.
        return re.compile(
            r"\b(" + "|".join(re.escape(k) for k in ordered) + r")\b"
        )

    def expand(self, text: str) -> str:
        """Append each known sigil's definition, in parentheses, after the sigil.

        The original sigil is kept so surrounding context is preserved.
        Matching is case-sensitive and whole-word only.

        Example:
            ``"High D1 state"`` ->
            ``"High D1 (dopamine D1 receptor, focused drive, linear execution,
            goal pursuit) state"``

        Note:
            Not idempotent: already-expanded text still contains the sigils
            (and definitions may mention them), so a second call expands again.

        Args:
            text: The raw text to expand.

        Returns:
            str: ``text`` with every matched sigil rewritten as
            ``SIGIL (definition)``. A matched sigil whose definition is missing
            or empty at call time (possible only if ``self.expansions`` was
            mutated after construction, or an empty definition was supplied)
            is left unchanged rather than rendered with empty parentheses.
        """

        def replace_match(match):
            """Render one regex hit as ``KEY (definition)``."""
            key = match.group(0)
            definition = self.expansions.get(key)
            if not definition:
                # Nothing useful to append; avoid emitting a noisy "KEY ()".
                return key
            return f"{key} ({definition})"

        return self.pattern.sub(replace_match, text)





class EnhancedLocalNCMEmbedder(embedding_functions.EmbeddingFunction):
    """ChromaDB embedding function that expands NCM sigils before embedding.

    Composes a ``SentenceTransformerEmbeddingFunction`` (``self.ef``) with an
    :class:`NCMSemanticPreprocessor` (``self.preprocessor``): each document is
    run through the preprocessor first, and the expanded text is what the
    wrapped embedding function receives.
    """

    def __init__(self, model_name: str = "all-mpnet-base-v2"):
        """Create the wrapped embedding function and the sigil preprocessor.

        Args:
            model_name: Model name forwarded to
                ``SentenceTransformerEmbeddingFunction``. Defaults to
                ``"all-mpnet-base-v2"``.
        """
        # Standard SentenceTransformer embedding function; the NCM-specific
        # behavior comes entirely from the preprocessing step in __call__.
        # NOTE(review): constructing this presumably loads (and may download)
        # the model -- confirm against the chromadb version in use.
        self.ef = embedding_functions.SentenceTransformerEmbeddingFunction(
            model_name=model_name
        )
        # Built with no overrides, i.e. the base expansion table only.
        self.preprocessor = NCMSemanticPreprocessor()

    def name(self) -> str:
        """Return the constant identifier of this embedding function.

        NOTE(review): the identifier is fixed and does not reflect
        ``model_name``; presumably ChromaDB uses it to tell embedding
        functions apart -- confirm before using a non-default model.

        Returns:
            str: Always ``"ncm_enhanced_mpnet"``.
        """
        return "ncm_enhanced_mpnet"

    def __call__(self, input: chromadb.Documents) -> chromadb.Embeddings:
        """Expand NCM sigils in each document, then embed the expanded batch.

        Args:
            input: Batch of raw document strings. The parameter name (which
                shadows the builtin) is kept so keyword callers keep working.

        Returns:
            chromadb.Embeddings: Whatever the wrapped
            ``SentenceTransformerEmbeddingFunction`` returns for the expanded
            documents, passed through unchanged.
        """
        # 1. Pre-process: the model should see "D1 (dopamine D1 receptor, ...)"
        #    instead of a bare "D1".
        expanded_input = [self.preprocessor.expand(doc) for doc in input]

        # 2. Embed the expanded text with the standard model.
        return self.ef(expanded_input)