Source code for ncm_local_embeddings

import re
from typing import Dict, Optional

import chromadb
from chromadb.utils import embedding_functions

# Hardcoded sigil -> definition table used as the fallback/core expansion map.
# Ideally this matches what's in your all_expansions.yaml.
#
# Keys are the exact, case-sensitive tokens that NCMSemanticPreprocessor looks
# for as whole words; values are the comma-separated descriptions it appends in
# parentheses after each matched key. Each preprocessor instance works on its
# own copy of this dict, so caller overrides never modify the table itself.
BASE_ABBREV_EXPANSIONS = {
    # Oxytocin, vasopressin, kisspeptin and thyroid entries.
    # "OXT" and "OXYTOCIN" intentionally share the same definition.
    "OXT": "oxytocin, social bonding, warmth, trust, co-regulation",
    "OXYTOCIN": "oxytocin, social bonding, warmth, trust, co-regulation",
    "VASOPRESSIN": "vasopressin, territorial guard, protective vigilance",
    "KISSPEPTIN": "kisspeptin, reproductive axis activation, erotic readiness",
    "THYROID": "thyroid tempo, metabolic pace, emotional tempo of life",
    # Serotonin (5HT) tone and receptor subtypes.
    "5HT": "serotonin tone, warmth, mood, safety, calm",
    "5HT1A": "serotonin 5HT1A receptor, anxiety soothing, calm body, safety",
    "5HT2A": "serotonin 5HT2A receptor, visionary perception, pattern amplification, psychedelic-style meaning intensity",
    "5HT2C": "serotonin 5HT2C receptor, appetite and impulse shaping, tension regulation",
    "5HT3": "serotonin 5HT3 receptor, gut sensation, nausea, visceral alerting",
    "5HT7": "serotonin 5HT7 receptor, circadian timing, rhythm of mood",
    # Dopamine signal and D1/D2 receptors.
    "DA": "dopamine signal, wanting, pursuit, reward prediction, curiosity",
    "D1": "dopamine D1 receptor, focused drive, linear execution, goal pursuit",
    "D2": "dopamine D2 receptor, checking, doubt, braking, reality testing",
    # Other signals and receptors (norepinephrine, GABA, NMDA, CB1, opioid, TAAR).
    "NE": "norepinephrine arousal, alertness, vigilance, threat scanning",
    "GABA": "GABAergic inhibition, nervous system brake, calm and quieting",
    "NMDA": "NMDA glutamate receptor, learning, plasticity, deep encoding",
    "CB1": "endocannabinoid CB1 receptor, melt, soft body, pain gating, dreamy spacing out",
    "MOR": "mu opioid receptor, pleasure, comfort, warm analgesia",
    "KOR": "kappa opioid receptor, dysphoria, ache, punishment-tone, edge of suffering",
    "TAAR": "trace amine associated receptors, weird salience, edgy stimulation, chemical oddness",
    "TAAR1": "TAAR1 receptor, amphetamine-like salience, jittery excitement",
    # Transporters, enzymes and chloride exporter/importer entries.
    "SERT": "serotonin transporter, reuptake, warmth and calm echo length; reversal causes flood and manic connectivity",
    "DAT": "dopamine transporter, reuptake of drive; reversal causes intense restless drive and perseveration",
    "NET": "norepinephrine transporter, alert edge decay; reversal keeps threat and focus looping",
    "VMAT2": "vesicular monoamine transporter 2, emotional payload packing, charge build-up",
    "MAO_A": "monoamine oxidase A, breakdown of monoamines, tone reset; low levels extend intensity",
    "COMT": "COMT enzyme, cortical dopamine clearance; affects executive tone",
    "EAAT2": "EAAT2 glutamate transporter, glutamate clean-up; prevents excitotoxic stress",
    "KCC2": "KCC2 chloride exporter, makes GABA more inhibitory, stabilizing",
    "NKCC1": "NKCC1 chloride importer, makes GABA more excitatory in immature or stressed states",
    # DMT and THC entries.
    "DMT": "endogenous dimethyltryptamine, numinous imagery, vivid visionary salience, brief overwhelming meaning bursts",
    "THC": "THC-like cannabinoid tone, altered time sense, sensory melt, CB1 heavy state",
    # Brain-region entries (cortex, nucleus accumbens, amygdala).
    "PFC": "prefrontal cortex, executive control, planning, narrative self",
    "DLPFC": "dorsolateral prefrontal cortex, working memory, top-down control",
    "ACC": "anterior cingulate cortex, conflict monitor, error feelings, salience of mismatch",
    "NACC": "nucleus accumbens, ventral striatum, reward hub, motivational pull",
    "AMYGDALA": "amygdala, emotional salience, especially fear and threat",
}

# (You can add the other large dicts here if you want them hardcoded in the embedder)


class NCMSemanticPreprocessor:
    """Annotate NCM sigils/acronyms in text with their semantic definitions.

    Each known key found as a whole word is kept in place and followed by its
    definition in parentheses, so the original wording survives alongside the
    added context.

    Attributes:
        expansions: Per-instance key -> definition map (a copy of
            ``BASE_ABBREV_EXPANSIONS`` plus any caller overrides).
        pattern: Compiled regex matching any non-empty key as a whole word.
            It is built once in ``__init__``; keys added to ``expansions``
            afterwards are not picked up until the pattern is rebuilt.
    """

    def __init__(self, expansions: Optional[Dict[str, str]] = None):
        """Build the expansion map and pre-compile the matching pattern.

        Performs no I/O; it only sets ``self.expansions`` and ``self.pattern``.

        Args:
            expansions: Optional key -> definition pairs merged on top of
                ``BASE_ABBREV_EXPANSIONS``; entries with the same key replace
                the base definition. ``None`` or an empty dict means the base
                map is used as-is.
        """
        # Copy so that overrides never leak into the shared module-level table.
        self.expansions = BASE_ABBREV_EXPANSIONS.copy()
        if expansions:
            self.expansions.update(expansions)

        # Compile once up front; expand() then substitutes in a single pass.
        self.pattern = self._build_pattern(self.expansions)

    @staticmethod
    def _build_pattern(expansions: Dict[str, str]) -> "re.Pattern[str]":
        """Compile a whole-word alternation over the non-empty keys of a map.

        Args:
            expansions: Map whose keys should be matched literally.

        Returns:
            A compiled pattern with one capturing group holding the matched
            key, or a pattern that never matches when there are no usable keys.
        """
        # An empty key would turn into an empty alternative that matches at
        # every word boundary, so such keys are dropped.
        # Longest-first ordering matters because regex alternation takes the
        # first alternative that succeeds: without it "NE" would win over an
        # override such as "NE-alpha" (the "-" already satisfies \b after "NE").
        keys = sorted((key for key in expansions if key), key=len, reverse=True)
        if not keys:
            # "(?!)" is an always-failing lookahead: expand() becomes a no-op.
            return re.compile(r"(?!)")

        alternation = "|".join(re.escape(key) for key in keys)
        # \b on both sides restricts matches to whole words so that keys are
        # not replaced inside longer words (e.g. "NE" inside "NEXT").
        return re.compile(r"\b(" + alternation + r")\b")

    def _render_match(self, match: "re.Match[str]") -> str:
        """Render one regex hit as ``KEY (definition)``.

        Args:
            match: A match produced by ``self.pattern``; group 0 is the key.

        Returns:
            The key followed by its parenthesised definition, or the key
            unchanged when it has no (or an empty) definition, so that no
            empty ``()`` is injected into the text.
        """
        key = match.group(0)
        definition = self.expansions.get(key, "")
        if not definition:
            return key
        return f"{key} ({definition})"

    def expand(self, text: str) -> str:
        """Append the definition after every known sigil found in ``text``.

        The definition is appended in parentheses rather than substituted for
        the key, to keep the original context.

        Example:
            "High D1 state" -> "High D1 (dopamine D1 receptor, focused drive...) state"

        Args:
            text: The text to annotate.

        Returns:
            ``text`` with each whole-word occurrence of a known key followed by
            its parenthesised definition. Matching is case-sensitive, and the
            inserted definitions are not themselves re-scanned.
        """
        return self.pattern.sub(self._render_match, text)
class EnhancedLocalNCMEmbedder(embedding_functions.EmbeddingFunction):
    """Embedding function that expands NCM sigils before embedding documents.

    Combines an :class:`NCMSemanticPreprocessor` with chromadb's
    ``SentenceTransformerEmbeddingFunction``: documents are first annotated by
    the preprocessor, then handed to the wrapped embedding function.
    """

    def __init__(self, model_name: str = "all-mpnet-base-v2"):
        """Create the wrapped embedding function and the sigil preprocessor.

        Args:
            model_name: Model name forwarded unchanged to chromadb's
                ``SentenceTransformerEmbeddingFunction``. Defaults to
                ``"all-mpnet-base-v2"``.
        """
        # NOTE(review): building the wrapped function presumably loads the
        # model; that happens inside chromadb and is not visible here.
        sentence_transformer_factory = (
            embedding_functions.SentenceTransformerEmbeddingFunction
        )
        self.ef = sentence_transformer_factory(model_name=model_name)

        # The preprocessor is created with the base expansion map only.
        self.preprocessor = NCMSemanticPreprocessor()

    def name(self) -> str:
        """Return the fixed identifier of this embedding function.

        Returns:
            The constant string ``"ncm_enhanced_mpnet"``.
        """
        # NOTE(review): the identifier does not vary with ``model_name``, even
        # when a non-default model is configured - confirm this is intended.
        identifier = "ncm_enhanced_mpnet"
        return identifier

    def __call__(self, input: chromadb.Documents) -> chromadb.Embeddings:
        """Expand the sigils in every document, then embed the expanded batch.

        Args:
            input: The documents to embed. The parameter name shadows the
                builtin but is kept as-is so keyword callers keep working.

        Returns:
            Whatever the wrapped embedding function returns for the list of
            expanded documents.
        """
        # Step 1: annotate each document, e.g. "D1" -> "D1 (dopamine D1 ...)".
        annotate = self.preprocessor.expand
        annotated_docs = list(map(annotate, input))

        # Step 2: embed the whole annotated batch in a single call.
        embed = self.ef
        return embed(annotated_docs)