# Source code for ncm_local_embeddings

import re
from typing import Dict, Iterable, List, Optional

import chromadb
from chromadb.utils import embedding_functions

# Built-in abbreviation -> definition table used as the default (and fallback)
# expansion set by NCMSemanticPreprocessor.
#
# Keys are the exact, case-sensitive tokens looked for in input text; values
# are the comma-separated descriptions appended in parentheses after a match.
# Ideally this stays in sync with what's in your all_expansions.yaml.
BASE_ABBREV_EXPANSIONS = {
  # Oxytocin, vasopressin, kisspeptin, thyroid
  "OXT": "oxytocin, social bonding, warmth, trust, co-regulation",
  "OXYTOCIN": "oxytocin, social bonding, warmth, trust, co-regulation",
  "VASOPRESSIN": "vasopressin, territorial guard, protective vigilance",
  "KISSPEPTIN": "kisspeptin, reproductive axis activation, erotic readiness",
  "THYROID": "thyroid tempo, metabolic pace, emotional tempo of life",
  # Serotonin (5HT) tone and receptor subtypes
  "5HT": "serotonin tone, warmth, mood, safety, calm",
  "5HT1A": "serotonin 5HT1A receptor, anxiety soothing, calm body, safety",
  "5HT2A": "serotonin 5HT2A receptor, visionary perception, pattern amplification, psychedelic-style meaning intensity",
  "5HT2C": "serotonin 5HT2C receptor, appetite and impulse shaping, tension regulation",
  "5HT3": "serotonin 5HT3 receptor, gut sensation, nausea, visceral alerting",
  "5HT7": "serotonin 5HT7 receptor, circadian timing, rhythm of mood",
  # Dopamine and norepinephrine
  "DA": "dopamine signal, wanting, pursuit, reward prediction, curiosity",
  "D1": "dopamine D1 receptor, focused drive, linear execution, goal pursuit",
  "D2": "dopamine D2 receptor, checking, doubt, braking, reality testing",
  "NE": "norepinephrine arousal, alertness, vigilance, threat scanning",
  # GABA, NMDA, cannabinoid, opioid and trace-amine receptors
  "GABA": "GABAergic inhibition, nervous system brake, calm and quieting",
  "NMDA": "NMDA glutamate receptor, learning, plasticity, deep encoding",
  "CB1": "endocannabinoid CB1 receptor, melt, soft body, pain gating, dreamy spacing out",
  "MOR": "mu opioid receptor, pleasure, comfort, warm analgesia",
  "KOR": "kappa opioid receptor, dysphoria, ache, punishment-tone, edge of suffering",
  "TAAR": "trace amine associated receptors, weird salience, edgy stimulation, chemical oddness",
  "TAAR1": "TAAR1 receptor, amphetamine-like salience, jittery excitement",
  # Monoamine transporters
  "SERT": "serotonin transporter, reuptake, warmth and calm echo length; reversal causes flood and manic connectivity",
  "DAT": "dopamine transporter, reuptake of drive; reversal causes intense restless drive and perseveration",
  "NET": "norepinephrine transporter, alert edge decay; reversal keeps threat and focus looping",
  "VMAT2": "vesicular monoamine transporter 2, emotional payload packing, charge build-up",
  # Enzymes
  "MAO_A": "monoamine oxidase A, breakdown of monoamines, tone reset; low levels extend intensity",
  "COMT": "COMT enzyme, cortical dopamine clearance; affects executive tone",
  # Glutamate and chloride transporters
  "EAAT2": "EAAT2 glutamate transporter, glutamate clean-up; prevents excitotoxic stress",
  "KCC2": "KCC2 chloride exporter, makes GABA more inhibitory, stabilizing",
  "NKCC1": "NKCC1 chloride importer, makes GABA more excitatory in immature or stressed states",
  # Compounds
  "DMT": "endogenous dimethyltryptamine, numinous imagery, vivid visionary salience, brief overwhelming meaning bursts",
  "THC": "THC-like cannabinoid tone, altered time sense, sensory melt, CB1 heavy state",
  # Brain regions
  "PFC": "prefrontal cortex, executive control, planning, narrative self",
  "DLPFC": "dorsolateral prefrontal cortex, working memory, top-down control",
  "ACC": "anterior cingulate cortex, conflict monitor, error feelings, salience of mismatch",
  "NACC": "nucleus accumbens, ventral striatum, reward hub, motivational pull",
  "AMYGDALA": "amygdala, emotional salience, especially fear and threat"
}

# (You can add the other large dicts here if you want them hardcoded in the embedder)

class NCMSemanticPreprocessor:
    """Annotate NCM sigils/acronyms in text with their semantic definitions.

    The preprocessor holds a key -> definition table (``self.expansions``)
    and a regex compiled from its keys (``self.pattern``). The regex is built
    once in ``__init__``; mutating ``self.expansions`` afterwards does not
    update ``self.pattern``.
    """

    def __init__(self, expansions: Optional[Dict[str, str]] = None):
        """Build the expansion table and compile the matching pattern.

        Args:
            expansions: Optional extra key -> definition pairs. They are
                merged over a copy of ``BASE_ABBREV_EXPANSIONS``, so a
                caller-supplied key overrides the built-in definition.
        """
        self.expansions = BASE_ABBREV_EXPANSIONS.copy()
        if expansions:
            self.expansions.update(expansions)

        # Pre-compile once; expand() is called for every document.
        self.pattern = self._build_pattern(self.expansions)

    @staticmethod
    def _build_pattern(keys: Iterable[str]) -> "re.Pattern[str]":
        """Compile a regex matching any of ``keys`` as a whole token.

        Keys are matched case-sensitively and only when not directly
        adjacent to a word character, so "D1" is not matched inside "XD1Y".
        """
        # Drop empty keys: an empty alternative would match at every
        # boundary and inject " ()" throughout the text.
        # Longest first so that a key extending another one with a symbol
        # (e.g. "D2+" vs "D2") wins over its shorter prefix.
        ordered = sorted((k for k in keys if k), key=len, reverse=True)
        if not ordered:
            # Nothing to expand: a pattern that can never match.
            return re.compile(r"(?!)")

        alternatives = "|".join(re.escape(k) for k in ordered)
        # Look-arounds instead of \b: identical for keys that start and end
        # with a word character, but also correct for keys whose first or
        # last character is a symbol (where \b would demand the wrong
        # neighbour).
        return re.compile(r"(?<!\w)(" + alternatives + r")(?!\w)")

    def expand(self, text: str) -> str:
        """Return ``text`` with each known key followed by its definition.

        The definition is appended in parentheses rather than replacing the
        key, so the original wording is preserved.

        Example:
            "High D1 state" ->
            "High D1 (dopamine D1 receptor, focused drive...) state"
        """

        def replace_match(match):
            key = match.group(0)
            # .get() keeps this safe if expansions was mutated after the
            # pattern was compiled.
            definition = self.expansions.get(key, "")
            return f"{key} ({definition})"

        return self.pattern.sub(replace_match, text)
class EnhancedLocalNCMEmbedder(embedding_functions.EmbeddingFunction):
    """Embedding function that expands NCM abbreviations before embedding.

    Each document is first passed through ``NCMSemanticPreprocessor.expand``
    and the expanded text is then handed to a
    ``SentenceTransformerEmbeddingFunction``.
    """

    def __init__(self, model_name: str = "all-mpnet-base-v2"):
        """Set up the preprocessor and the wrapped sentence-transformer.

        Args:
            model_name: Model name forwarded to
                ``SentenceTransformerEmbeddingFunction``.
        """
        # Abbreviation expander with the built-in expansion table.
        self.preprocessor = NCMSemanticPreprocessor()
        # Standard SentenceTransformer backend that produces the vectors.
        self.ef = embedding_functions.SentenceTransformerEmbeddingFunction(
            model_name=model_name
        )

    def name(self) -> str:
        """Return this embedder's identifier (fixed, independent of model_name)."""
        return "ncm_enhanced_mpnet"

    def __call__(self, input: chromadb.Documents) -> chromadb.Embeddings:
        """Expand abbreviations in every document, then embed the results."""
        # The model sees "D1 (dopamine drive...)" instead of just "D1".
        expanded_docs = list(map(self.preprocessor.expand, input))
        return self.ef(expanded_docs)