import re
from typing import Dict, Optional

import chromadb
from chromadb.utils import embedding_functions
# Built-in NCM acronym -> semantic definition table.
#
# This is the hardcoded fallback/core vocabulary; ideally it mirrors the
# contents of all_expansions.yaml. Keys are matched case-sensitively, and the
# definition text is appended in parentheses after each sigil found in a
# document (see NCMSemanticPreprocessor.expand).
BASE_ABBREV_EXPANSIONS: Dict[str, str] = {
    # --- Oxytocin, vasopressin, kisspeptin, thyroid ---
    "OXT": "oxytocin, social bonding, warmth, trust, co-regulation",
    "OXYTOCIN": "oxytocin, social bonding, warmth, trust, co-regulation",
    "VASOPRESSIN": "vasopressin, territorial guard, protective vigilance",
    "KISSPEPTIN": "kisspeptin, reproductive axis activation, erotic readiness",
    "THYROID": "thyroid tempo, metabolic pace, emotional tempo of life",
    # --- Serotonin tone and receptors ---
    "5HT": "serotonin tone, warmth, mood, safety, calm",
    "5HT1A": "serotonin 5HT1A receptor, anxiety soothing, calm body, safety",
    "5HT2A": "serotonin 5HT2A receptor, visionary perception, pattern amplification, psychedelic-style meaning intensity",
    "5HT2C": "serotonin 5HT2C receptor, appetite and impulse shaping, tension regulation",
    "5HT3": "serotonin 5HT3 receptor, gut sensation, nausea, visceral alerting",
    "5HT7": "serotonin 5HT7 receptor, circadian timing, rhythm of mood",
    # --- Dopamine signal and receptors ---
    "DA": "dopamine signal, wanting, pursuit, reward prediction, curiosity",
    "D1": "dopamine D1 receptor, focused drive, linear execution, goal pursuit",
    "D2": "dopamine D2 receptor, checking, doubt, braking, reality testing",
    # --- Other transmitter systems and receptors ---
    "NE": "norepinephrine arousal, alertness, vigilance, threat scanning",
    "GABA": "GABAergic inhibition, nervous system brake, calm and quieting",
    "NMDA": "NMDA glutamate receptor, learning, plasticity, deep encoding",
    "CB1": "endocannabinoid CB1 receptor, melt, soft body, pain gating, dreamy spacing out",
    "MOR": "mu opioid receptor, pleasure, comfort, warm analgesia",
    "KOR": "kappa opioid receptor, dysphoria, ache, punishment-tone, edge of suffering",
    "TAAR": "trace amine associated receptors, weird salience, edgy stimulation, chemical oddness",
    "TAAR1": "TAAR1 receptor, amphetamine-like salience, jittery excitement",
    # --- Monoamine transporters ---
    "SERT": "serotonin transporter, reuptake, warmth and calm echo length; reversal causes flood and manic connectivity",
    "DAT": "dopamine transporter, reuptake of drive; reversal causes intense restless drive and perseveration",
    "NET": "norepinephrine transporter, alert edge decay; reversal keeps threat and focus looping",
    "VMAT2": "vesicular monoamine transporter 2, emotional payload packing, charge build-up",
    # --- Enzymes, glutamate and chloride transporters ---
    "MAO_A": "monoamine oxidase A, breakdown of monoamines, tone reset; low levels extend intensity",
    "COMT": "COMT enzyme, cortical dopamine clearance; affects executive tone",
    "EAAT2": "EAAT2 glutamate transporter, glutamate clean-up; prevents excitotoxic stress",
    "KCC2": "KCC2 chloride exporter, makes GABA more inhibitory, stabilizing",
    "NKCC1": "NKCC1 chloride importer, makes GABA more excitatory in immature or stressed states",
    # --- Compounds ---
    "DMT": "endogenous dimethyltryptamine, numinous imagery, vivid visionary salience, brief overwhelming meaning bursts",
    "THC": "THC-like cannabinoid tone, altered time sense, sensory melt, CB1 heavy state",
    # --- Brain regions ---
    "PFC": "prefrontal cortex, executive control, planning, narrative self",
    "DLPFC": "dorsolateral prefrontal cortex, working memory, top-down control",
    "ACC": "anterior cingulate cortex, conflict monitor, error feelings, salience of mismatch",
    "NACC": "nucleus accumbens, ventral striatum, reward hub, motivational pull",
    "AMYGDALA": "amygdala, emotional salience, especially fear and threat",
}
# (You can add the other large dicts here if you want them hardcoded in the embedder)
class NCMSemanticPreprocessor:
    """Annotate NCM sigils/acronyms in text with their semantic definitions.

    The instance holds an acronym -> definition map and a single pre-compiled
    regex that recognises every known acronym as a standalone token, so
    :meth:`expand` can rewrite a string in one substitution pass.

    Attributes are plain and public:

    * ``expansions`` -- the merged acronym -> definition dict.
    * ``pattern`` -- the compiled regex built from ``expansions`` at
      construction time. It is NOT rebuilt if ``expansions`` is mutated later.
    """

    def __init__(self, expansions: Optional[Dict[str, str]] = None):
        """Build the expansion map and compile the matching pattern.

        Starts from a copy of the module-level ``BASE_ABBREV_EXPANSIONS`` (so
        the shared table is never mutated), overlays any caller-supplied
        entries, and compiles one regex covering every key. Performs no I/O.

        Args:
            expansions: Optional acronym -> definition pairs merged on top of
                the base map; entries with the same key override the base
                definition. ``None`` or an empty dict means "base map only".
        """
        self.expansions = BASE_ABBREV_EXPANSIONS.copy()
        if expansions:
            self.expansions.update(expansions)
        # Compile once here so expand() does no per-call regex construction.
        self.pattern = self._compile_pattern(self.expansions)

    @staticmethod
    def _compile_pattern(expansions: Dict[str, str]) -> "re.Pattern":
        """Compile one regex that matches any key of *expansions* as a whole token.

        Args:
            expansions: Acronym -> definition map; only its keys are used.

        Returns:
            A compiled pattern whose group 0 (and group 1) is the matched key.
            If there are no usable keys the pattern never matches anything.
        """
        # Longest keys first: regex alternation is ordered, so without this a
        # short key (e.g. "NE") could win over a longer key that extends it
        # past a non-word character (e.g. "NE-alpha"). Empty keys are dropped
        # because an empty alternative would match at every position.
        ordered_keys = sorted(
            (key for key in expansions if key), key=len, reverse=True
        )
        if not ordered_keys:
            # "(?!)" is an always-failing lookahead: expand() becomes a no-op.
            return re.compile(r"(?!)")
        alternation = "|".join(re.escape(key) for key in ordered_keys)
        # Use "not adjacent to a word character" lookarounds instead of \b.
        # For keys made only of word characters the two are equivalent, but \b
        # can never match next to a key that starts or ends with punctuation
        # (e.g. "Ca2+"), whereas the lookarounds still treat it as a token.
        return re.compile(r"(?<!\w)(" + alternation + r")(?!\w)")

    def expand(self, text: str) -> str:
        """Append each known sigil's definition, in parentheses, after the sigil.

        The original token is kept so surrounding context is preserved; the
        definition is added rather than substituted. Matching is
        case-sensitive and only hits standalone tokens, never a fragment of a
        longer word. The text is scanned once, so definitions that themselves
        contain a sigil are not expanded again within the same call.

        Example:
            ``"High D1 state"`` ->
            ``"High D1 (dopamine D1 receptor, focused drive...) state"``

        Args:
            text: The raw string to annotate.

        Returns:
            The annotated string; identical to *text* when no sigil occurs.
        """

        def replace_match(match):
            """Render one regex hit as ``KEY (definition)``."""
            key = match.group(0)
            # .get() rather than [] so a key removed from self.expansions after
            # the pattern was compiled yields empty parens instead of KeyError.
            definition = self.expansions.get(key, "")
            return f"{key} ({definition})"

        return self.pattern.sub(replace_match, text)
class EnhancedLocalNCMEmbedder(embedding_functions.EmbeddingFunction):
    """ChromaDB embedding function that expands NCM sigils before embedding.

    Composes two pieces: an :class:`NCMSemanticPreprocessor` that annotates
    sigils with their definitions, and ChromaDB's
    ``SentenceTransformerEmbeddingFunction`` that turns the annotated text
    into vectors.

    NOTE: :meth:`name` returns a fixed string regardless of ``model_name`` or
    ``expansions``, so the name alone does not distinguish embedders built
    with different settings. Keep both consistent for any one collection.
    """

    def __init__(
        self,
        model_name: str = "all-mpnet-base-v2",
        *,
        expansions: Optional[Dict[str, str]] = None,
    ):
        """Create the wrapped embedder and its sigil preprocessor.

        Args:
            model_name: Model name forwarded to
                ``SentenceTransformerEmbeddingFunction``. Defaults to
                ``"all-mpnet-base-v2"``.
            expansions: Optional acronym -> definition overrides forwarded to
                :class:`NCMSemanticPreprocessor`, where they are merged on top
                of the built-in table. Defaults to ``None`` (built-in table
                only), which matches the previous behaviour.
        """
        # The stock SentenceTransformer embedder does the actual vector work;
        # this class only changes what text it is given.
        # NOTE(review): constructing it presumably loads the model -- confirm
        # the cost against the installed chromadb version.
        self.ef = embedding_functions.SentenceTransformerEmbeddingFunction(
            model_name=model_name
        )
        self.preprocessor = NCMSemanticPreprocessor(expansions)

    def name(self) -> str:
        """Return the constant identifier of this embedding function.

        NOTE(review): presumably consumed by ChromaDB to label which embedder
        produced a collection's vectors -- confirm against the installed
        chromadb version.

        Returns:
            The string ``"ncm_enhanced_mpnet"``.
        """
        return "ncm_enhanced_mpnet"

    def __call__(self, input: chromadb.Documents) -> chromadb.Embeddings:
        """Expand NCM sigils in every document, then embed the expanded batch.

        The parameter is named ``input`` (shadowing the builtin) on purpose:
        the name is kept as-is so existing keyword callers continue to work.

        Args:
            input: Batch of raw document strings.

        Returns:
            Whatever the wrapped ``SentenceTransformerEmbeddingFunction``
            returns for the expanded documents, passed through unchanged.
        """
        # 1. Pre-process: the model should see "D1 (dopamine D1 receptor, ...)"
        #    rather than a bare, semantically opaque "D1".
        expanded_input = [self.preprocessor.expand(doc) for doc in input]
        # 2. Embed the whole expanded batch in one call to the wrapped model.
        return self.ef(expanded_input)