import re
from typing import Dict, Iterable, List, Optional

import chromadb
from chromadb.utils import embedding_functions
# Hardcoded expansions as fallback/core logic
# Ideally this matches what's in your all_expansions.yaml
# Maps each abbreviation (exact, case-sensitive spelling) to the comma-separated
# phrase that is appended after it before the text is embedded.
# Insertion order is kept as-is because the preprocessor iterates the keys.
BASE_ABBREV_EXPANSIONS = {
    # --- Hormone / peptide signals ---
    "OXT": "oxytocin, social bonding, warmth, trust, co-regulation",
    "OXYTOCIN": "oxytocin, social bonding, warmth, trust, co-regulation",
    "VASOPRESSIN": "vasopressin, territorial guard, protective vigilance",
    "KISSPEPTIN": "kisspeptin, reproductive axis activation, erotic readiness",
    "THYROID": "thyroid tempo, metabolic pace, emotional tempo of life",

    # --- Serotonin tone and receptors ---
    "5HT": "serotonin tone, warmth, mood, safety, calm",
    "5HT1A": "serotonin 5HT1A receptor, anxiety soothing, calm body, safety",
    "5HT2A": "serotonin 5HT2A receptor, visionary perception, pattern amplification, psychedelic-style meaning intensity",
    "5HT2C": "serotonin 5HT2C receptor, appetite and impulse shaping, tension regulation",
    "5HT3": "serotonin 5HT3 receptor, gut sensation, nausea, visceral alerting",
    "5HT7": "serotonin 5HT7 receptor, circadian timing, rhythm of mood",

    # --- Dopamine signal and receptors ---
    "DA": "dopamine signal, wanting, pursuit, reward prediction, curiosity",
    "D1": "dopamine D1 receptor, focused drive, linear execution, goal pursuit",
    "D2": "dopamine D2 receptor, checking, doubt, braking, reality testing",

    # --- Norepinephrine, GABA, glutamate ---
    "NE": "norepinephrine arousal, alertness, vigilance, threat scanning",
    "GABA": "GABAergic inhibition, nervous system brake, calm and quieting",
    "NMDA": "NMDA glutamate receptor, learning, plasticity, deep encoding",

    # --- Cannabinoid, opioid and trace-amine receptors ---
    "CB1": "endocannabinoid CB1 receptor, melt, soft body, pain gating, dreamy spacing out",
    "MOR": "mu opioid receptor, pleasure, comfort, warm analgesia",
    "KOR": "kappa opioid receptor, dysphoria, ache, punishment-tone, edge of suffering",
    "TAAR": "trace amine associated receptors, weird salience, edgy stimulation, chemical oddness",
    "TAAR1": "TAAR1 receptor, amphetamine-like salience, jittery excitement",

    # --- Monoamine transporters ---
    "SERT": "serotonin transporter, reuptake, warmth and calm echo length; reversal causes flood and manic connectivity",
    "DAT": "dopamine transporter, reuptake of drive; reversal causes intense restless drive and perseveration",
    "NET": "norepinephrine transporter, alert edge decay; reversal keeps threat and focus looping",
    "VMAT2": "vesicular monoamine transporter 2, emotional payload packing, charge build-up",

    # --- Clearance enzymes ---
    "MAO_A": "monoamine oxidase A, breakdown of monoamines, tone reset; low levels extend intensity",
    "COMT": "COMT enzyme, cortical dopamine clearance; affects executive tone",

    # --- Glutamate and chloride transporters ---
    "EAAT2": "EAAT2 glutamate transporter, glutamate clean-up; prevents excitotoxic stress",
    "KCC2": "KCC2 chloride exporter, makes GABA more inhibitory, stabilizing",
    "NKCC1": "NKCC1 chloride importer, makes GABA more excitatory in immature or stressed states",

    # --- Psychoactive compounds ---
    "DMT": "endogenous dimethyltryptamine, numinous imagery, vivid visionary salience, brief overwhelming meaning bursts",
    "THC": "THC-like cannabinoid tone, altered time sense, sensory melt, CB1 heavy state",

    # --- Brain regions ---
    "PFC": "prefrontal cortex, executive control, planning, narrative self",
    "DLPFC": "dorsolateral prefrontal cortex, working memory, top-down control",
    "ACC": "anterior cingulate cortex, conflict monitor, error feelings, salience of mismatch",
    "NACC": "nucleus accumbens, ventral striatum, reward hub, motivational pull",
    "AMYGDALA": "amygdala, emotional salience, especially fear and threat",
}
# (You can add the other large dicts here if you want them hardcoded in the embedder)
class NCMSemanticPreprocessor:
    """Annotate NCM abbreviations in text with their semantic definitions.

    The lookup table starts as a copy of ``BASE_ABBREV_EXPANSIONS`` and can be
    extended or overridden per instance. Matching is case-sensitive and only
    fires on whole tokens, so ``"DA"`` is expanded but ``"DATA"`` is not.
    """

    def __init__(self, expansions: Optional[Dict[str, str]] = None):
        """Build the lookup table and compile the matching pattern.

        Args:
            expansions: Optional extra ``abbreviation -> definition`` entries.
                They are merged over the built-in table, so a key that already
                exists there is overridden.
        """
        # Copy so per-instance overrides never mutate the shared module table.
        self.expansions = BASE_ABBREV_EXPANSIONS.copy()
        if expansions:
            self.expansions.update(expansions)
        # Compile once up front; expand() only has to run the substitution.
        self.pattern = self._build_pattern(self.expansions)

    @staticmethod
    def _build_pattern(keys: Iterable[str]) -> "re.Pattern":
        """Compile one regex that matches any non-empty key as a whole token.

        Args:
            keys: The abbreviations to recognise. Empty strings are ignored.

        Returns:
            A compiled pattern whose full match is the abbreviation itself.
            When there are no usable keys the pattern never matches.
        """
        # Longest first: alternation takes the first alternative that matches,
        # so "D2" listed ahead of "D2+" would shadow the longer key.
        ordered = sorted((k for k in keys if k), key=len, reverse=True)
        if not ordered:
            # An empty negative lookahead can never succeed.
            return re.compile(r"(?!)")
        alternation = "|".join(re.escape(k) for k in ordered)
        # Lookarounds instead of \b: same result for keys that start and end
        # with word characters, and also correct for keys that do not
        # (\b would demand an adjacent word character after e.g. "D2+").
        return re.compile(r"(?<!\w)(" + alternation + r")(?!\w)")

    def expand(self, text: str) -> str:
        """Return *text* with each known abbreviation followed by its definition.

        The abbreviation itself is kept and the definition is appended in
        parentheses, so the original wording and context are preserved.

        Example:
            "High D1 state" ->
            "High D1 (dopamine D1 receptor, focused drive, ...) state"

        Args:
            text: The raw document or query string.

        Returns:
            The annotated string; unchanged when no abbreviation occurs.
        """
        def annotate(match):
            key = match.group(0)
            definition = self.expansions.get(key, "")
            return f"{key} ({definition})"

        return self.pattern.sub(annotate, text)
class EnhancedLocalNCMEmbedder(embedding_functions.EmbeddingFunction):
    """Chroma embedding function that expands NCM abbreviations before embedding.

    Each document is first passed through ``NCMSemanticPreprocessor.expand``
    and the annotated text is then embedded by a SentenceTransformer model.
    """

    def __init__(
        self,
        model_name: str = "all-mpnet-base-v2",
        expansions: Optional[Dict[str, str]] = None,
    ):
        """Create the wrapped embedder and the abbreviation preprocessor.

        Args:
            model_name: SentenceTransformer model identifier handed to
                ``SentenceTransformerEmbeddingFunction``.
            expansions: Optional extra ``abbreviation -> definition`` entries
                forwarded to ``NCMSemanticPreprocessor``; ``None`` keeps the
                built-in table only.
        """
        # The standard SentenceTransformer function does the actual embedding;
        # this class only rewrites the text it receives.
        self.ef = embedding_functions.SentenceTransformerEmbeddingFunction(
            model_name=model_name
        )
        self.preprocessor = NCMSemanticPreprocessor(expansions)

    def name(self) -> str:
        """Return the fixed identifier of this embedding function."""
        # NOTE(review): the identifier does not vary with `model_name`, and
        # newer chromadb releases appear to declare `name` as a static method.
        # Confirm both against the installed chromadb version.
        return "ncm_enhanced_mpnet"

    def __call__(self, input: chromadb.Documents) -> chromadb.Embeddings:
        """Embed *input* after expanding abbreviations in every document.

        Args:
            input: The documents to embed. The parameter name is kept as
                ``input`` (shadowing the builtin) because callers may pass it
                by keyword.

        Returns:
            Whatever the wrapped SentenceTransformer function returns for the
            expanded documents, in the same order.
        """
        # 1. Pre-process: the model should see "D1 (dopamine D1 receptor, ...)"
        #    instead of the bare "D1".
        expanded_input = [self.preprocessor.expand(doc) for doc in input]
        # 2. Embed with the standard model.
        return self.ef(expanded_input)