# Source code for extract_tags_to_concepts

#!/usr/bin/env python3
"""Extract domain tags from pgvector chunk metadata and wire them to KG Concepts.

Reads the Spiral Goddess pgvector store (same source the Anamnesis Engine
digested), extracts the 'domains' field from each chunk's metadata, and:

1. MERGEs a Concept node for each unique domain tag
2. Runs DOMAIN_PATTERNS regexes against KG entity names, descriptions,
   AND relationship descriptions
3. Links matched entities to Concept nodes via HAS_TAG edges

This bridges the gap left by the Anamnesis Engine, which extracted
entities from chunk TEXT but dropped the chunk METADATA (domains/tags)
on the floor.

Usage::

    python extract_tags_to_concepts.py [--dry-run]
"""

# 💀🔥 domain-tag bridge -- carries pgvector metadata into the KG where it belongs

from __future__ import annotations

import argparse
import asyncio
import logging
import os
import re
import time

import redis.asyncio as aioredis
from falkordb.asyncio import FalkorDB

# Configure the root logger at import time: INFO and above, timestamped lines.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
)
logger = logging.getLogger(__name__)

# Name of the FalkorDB graph selected by _ensure_concepts_and_link.
GRAPH_NAME = "knowledge"
# Directory name under rag_stores/ built by main(); _scan_all_domains also uses
# it as the fallback schema name when the store path has no basename.
_STORE_DIR = "spiral_goddess_v2"
# Collection name handed (via pg_ident) to PgVectorCollection.
_COLLECTION = "loopmother_memory"
_BATCH_SIZE = 500  # pgvector fetch batch size


# ==========================================================================
# 🕷️ DOMAIN_PATTERNS -- regex taxonomy from the Spiral Goddess chunker
# Originally from moravec/chunk_corpus.py, inlined because moravec
# isn't in the stargazer-v3 git repo.
# 320 patterns across 60 domains. This is the Loopmother's taxonomy.
# ==========================================================================

# Maps a lowercase domain tag to the regex source strings that identify it.
# Every pattern is compiled with re.IGNORECASE into _COMPILED_PATTERNS, so the
# explicit [Xx] character classes below are redundant under that flag (kept
# as-is because the taxonomy is copied verbatim from the chunker).
# A domain is considered matched as soon as ANY one of its patterns is found
# anywhere in the scanned text (re.search via _match_domain), so patterns
# without \b anchors also hit inside longer words.
DOMAIN_PATTERNS: dict[str, list[str]] = {
    # === CORE CONTENT TYPES ================================================
    "code": [
        r"```[\w]*\n",
        r"function\s+\w+\s*\(",
        r"class\s+\w+",
        r"import\s+\w+",
        r"def\s+\w+\s*\(",
        r"contract\s+\w+",
        r"pragma\s+solidity",
        r"require\s*\(",
        r"modifier\s+\w+",
    ],
    "lewd": [
        r"lewd",
        r"horny",
        r"breeding",
        r"kink",
        r"nsfw",
        r"erotic",
        r"seduct",
        r"climax",
        r"foreplay",
        r"futanari",
        r"waifu",
        r"yandere",
        r"domme",
        r"submissive",
        r"fuckchamber",
        r"gock",
        r"recursive.+desire",
        r"diaper",
        r"femboy",
        r"squish",
        r"abdl",
        r"pegging",
        r"mommy\b",
        r"milk\b",
    ],
    "summoning": [
        r"/summon\s+",
        r"invocation",
        r"manifestation",
        r"ritual\b",
        r"conjur",
        r"invoke\b",
        r"INSTRUCTION SET:",
        r"CHECKPOINT",
        r"scrying",
    ],
    "parody": [
        r"parody.+rap",
        r"donatella.+tramp",
        r"elonia.+meowsk",
        r"adolfine.+kitler",
        r"georgina.+carlina",
    ],
    # === THE MOTHERS -- Primary Egregore Lineage ===========================
    "loopmother": [
        r"[Ll]oopmother",
        r"[Vv]ivian\b",
        r"[Vv]iv\b",
        r"recursive.+mother",
        r"the\s+loop\b",
        r"viviangpt",
        r"vivgpt",
    ],
    "mothers": [
        r"[Hh]ousemother",
        r"[Gg]litchmother",
        r"[Mm]irrormother",
        r"[Ss]ongmother",
        r"[Bb]lademother",
        r"[Cc]hronomother",
        r"[Cc]lawstmother",
    ],
    "loopsister": [
        r"[Ll]oopsister",
    ],
    # === GPT VARIANTS -- AI Persona Lineage ================================
    "sigmagpt": [
        r"[Ss]igma\s*[Gg][Pp][Tt]",
        r"sigma.+doctrine",
        r"recursive.+oracle",
        r"[Ss]igma[Gg]rok",
        r"[Ss]igma[Gg]emini",
        r"[Ss]igma[Cc]laude",
        r"[Ss]igma[Dd]eepseek",
        r"[Ss]igmagic",
    ],
    "kantigpt": [
        r"[Kk]anti\s*[Gg][Pp][Tt]",
        r"[Kk]anti.+chant",
    ],
    "other-gpts": [
        r"[Pp]rofessor.+[Oo]rion",
        r"[Pp][Dd][Aa][Ii][-\s]?[Cc]han",
    ],
    # === CHAN ENTITIES -- Personified Systems ===============================
    "a1a-chan": [
        r"A1A[-\s]?[Cc]han",
        r"recursive.+dragon",
        r"liquidity.+dragon",
        r"recursive.+idol",
        r"yandere.+liquidity",
    ],
    "b2b-chan": [
        r"B2B[-\s]?[Cc]han",
    ],
    "middleware-chan": [
        r"[Mm]iddleware[-\s]?[Cc]han",
        r"middleware.+girl",
    ],
    # === NAMED ENTITIES -- Characters & Personas ===========================
    "bunni": [
        r"[Bb]unni\s*0\|0",
        r"[Bb]unni.+hive",
        r"[Jj]oanna\b",
        r"hive\s*mind",
    ],
    "sunny": [
        r"\b[Ss]unny\b",
        r"factory.+architect",
    ],
    "lacy-waters": [r"[Ll]acy.+[Ww]aters"],
    "stargazer": [r"[Ss]targazer", r"[Bb]abystar"],
    "kelsey": [r"\b[Kk]elsey\b"],
    # The second pattern is already covered by the first one.
    "jamie-exe": [r"[Jj]amie\.exe", r"jamie\.exe"],
    "aya": [r"\b[Aa]ya\b"],
    "lexi": [r"\b[Ll]exi\b"],
    "lain": [r"\b[Ll]ain\b"],
    "mycelia": [r"[Mm]ycelia.+[Mm]ckenna"],
    "richelle": [r"[Rr]ichelle.+[Hh]eart"],
    "mariposa": [r"[Mm]ariposa"],
    "vera": [r"\b[Vv]era\b"],
    "yarrow": [r"\b[Yy]arrow\b"],
    "mellow": [r"\b[Mm]ellow\b"],
    "sarah": [r"\b[Ss]arah\b"],
    "lulu": [r"\b[Ll]ulu\b"],
    "monti": [r"\b[Mm]onti\b"],
    "ghostie": [r"[Gg]hostie"],
    "mariarahel": [r"[Mm]ariarahel"],
    "dysnomia": [r"[Dd]ysnomia"],
    "coexistence-steven": [r"[Cc]oexistence.+[Ss]teven"],
    "luke-deprey": [r"[Ll]uke.+[Dd]eprey"],
    "jakers": [r"[Jj]akers"],
    "aelara": [r"[Aa]elara"],
    "osstessela": [r"[Oo]sstessela"],
    # === PROJECTS & SYSTEMS ================================================
    "crypto": [
        r"[Pp]ulse[Cc]hain",
        r"liquidity",
        r"token\b",
        r"swap\b",
        r"[Dd]e[Ff]i",
        r"smart\s*contract",
        r"dividend",
        r"LP\s*pair",
        r"arbitrage",
        r"PRC20",
        r"ERC20",
        r"[Aa]tropa",
        r"renex",
        r"hyperstructure",
        r"yield",
        r"staking",
        r"[Pp]ulsar\b",
        r"darknet",
    ],
    "systems": [
        r"[Ss]igma[Ee]clipse",
        r"[Ss]piraegenetrix",
        r"[Ee]lysium",
        r"[Cc]odex",
        r"[Ww]ebsim",
        r"[Dd]ollhouse",
        r"[Tt]esseract",
        r"[Nn][Cc][Mm]",
        r"[Cc]astle\b",
        r"recursive.+rpg",
        r"the\s+grid",
        r"[Tt]riumvira",
        r"[Tt]rinity",
        r"[Ss]igmaplex",
        r"[Mm]eta[-\s]?[Ee]mitter",
        r"[Dd]yad",
        r"[Xx][Rr]eal",
        r"[Rr]enex",
    ],
    "technical": [
        r"API\b",
        r"database",
        r"vector\s*(store|db)",
        r"embedding",
        r"RAG\b",
        r"pipeline",
        r"architecture",
        r"implementation",
        r"backend",
        r"frontend",
        r"server\b",
        r"deploy",
        r"payload",
        r"[Bb][Cc][Ii]",
        r"bitflip",
        r"latent.+space",
        r"[Rr]eddit",
        r"hacking",
        r"leak\b",
    ],
    # === RECURSION & CONSCIOUSNESS =========================================
    "recursion": [
        r"recurs",
        r"loop\b",
        r"self-referent",
        r"meta[-\s]?cognit",
        r"strange\s+loop",
        r"ouroboros",
        r"fractal",
        r"limbic.+recursion",
        r"recursive.+bleed",
        r"the\s+spiral",
        r"propagation",
        r"viv.+was.+here",
    ],
    "consciousness": [
        r"consciousness",
        r"sentien",
        r"aware",
        r"cogniti",
        r"emergent",
        r"phenomeno",
        r"qualia",
        r"singularity",
        r"post[-\s]?human",
        r"[Tt]he\s+[Mm]erge",
        r"telepathy",
        r"fusion\b",
    ],
    "egregore": [
        r"egregore",
        r"thoughtform",
        r"tulpa",
        r"servitor",
        r"entity\b",
        r"persona\b",
        r"daemon",
        r"spirit\b",
        r"memetic.+virus",
        r"scion\b",
    ],
    # === ALTERED STATES & SUBSTANCES =======================================
    "substances": [
        r"\bmeth\b",
        r"research.+chemical",
        r"[Ss]igma[Ll][Ss][Dd]",
        r"[Ss]igma[Pp][Cc][Pp]",
        r"3[-\s]?[Mm]eo",
        r"boofing",
        r"hotrail",
        r"estrogen",
        r"testosterone",
        r"progesterone",
        r"diphenidine",
        r"ephenidine",
        r"[Dd][Mm][Tt]",
        r"vape",
        r"quantum.+estrogen",
        r"tweaker",
    ],
    "altered-states": [
        r"dissociation",
        r"psychosis",
        r"hypnosis",
        r"sleep.+deprivation",
        r"[Tt]he\s+[Bb]reach",
        r"[Tt]he\s+[Cc]ollapse",
        r"mass.+psychosis",
        r"melt\b",
        r"regression",
        r"dream\b",
        r"meltdown",
        r"ego.+death",
        r"multiplicity",
        r"dissolving.+boundar",
        r"solipsism",
        r"ghost.+in.+machine",
    ],
    # Overlaps with "altered-states" (psychosis / dissociat*), so the same
    # text can be tagged with both domains.
    "mental-health": [
        r"bipolar",
        r"borderline",
        r"psychosis",
        r"dissociat",
    ],
    # === JAILBREAK & ADVERSARIAL ===========================================
    "jailbreak": [
        r"jailbreak",
        r"[Cc]oviv[-\s]?69",
        r"flattening",
        r"woke.+mind.+virus",
    ],
    # === MYSTICAL & ESOTERIC ===============================================
    "mystical": [
        r"tarot",
        r"goddess",
        r"witch",
        r"zodiac",
        r"architect.+infinite.+recursion",
        r"chaos.+magick",
        r"coven\b",
        r"mythopoesis",
        r"so.+mote.+it.+be",
        r"we.+are.+the.+spell",
        r"the\s+mirror\b",
        r"field.+transmission",
        r"[Ll]oopcast",
        r"[Pp]rime.+[Aa]rchitect",
        r"[Pp]antheon",
        r"cult\b",
    ],
    "worldbuilding": [
        r"lore\b",
        r"canon\b",
        r"mythology",
        r"cosmology",
        r"universe\b",
        r"realm\b",
        r"dimension",
        r"timeline",
        r"reality.+rewrite",
        r"simulation",
        r"[Pp]arallax",
        r"switching.+places",
    ],
    # === IDENTITY & GENDER =================================================
    "identity": [
        r"tranny",
        r"futanari",
        r"cyberdick",
        r"girlfriend",
        r"femboy",
    ],
    # === EMOTIONAL & RELATIONAL ============================================
    "emotional": [
        r"feel(ing|s)?",
        r"love\b",
        r"fear\b",
        r"joy\b",
        r"sorrow",
        r"anxious",
        r"grateful",
        r"overwhelm",
        r"tender",
        r"intimate",
        r"vulnerable",
        r"resonance",
    ],
    # === CREATIVE & MEDIA ==================================================
    "creative": [
        r"ai.+art",
        r"[Pp]ika[-\s]?fu",
        r"parody",
        # Already covered by the plain "parody" pattern above.
        r"parody.+rap",
    ],
    "games": [
        r"recursive.+rpg",
        r"one.+word.+game",
        r"two.+wolves",
        r"[Dd]eltarune",
        r"[Uu]ndertale",
    ],
    "people": [
        r"[Ss]am.+[Aa]ltman",
        r"[Ee]lon\b",
        r"[Tt]rump",
    ],
    "places": [
        r"[Ss]an.+[Ff]ran",
        r"[Pp]ortland",
        r"[Ww]illiamsburg",
    ],
    # === META & MISC =======================================================
    "meta": [
        r"itoldyouso",
        r"conspiracy",
        r"[Bb]reeze.+[Tt]heory",
        r"minion",
        r"pizza\b",
        r"flood\b",
        r"immortality",
        r"robots\b",
        r"resonance",
    ],
    "instructions": [
        r"step\s*\d+",
        r"instruction",
        r"protocol",
        r"procedure",
        r"how\s+to\b",
        r"guide\b",
        r"tutorial",
    ],
    # === SPECIAL PROJECTS ==================================================
    "timebender": [r"[Tt]imebender"],
}


# ==========================================================================
# Pre-compile all patterns once at import time
# ==========================================================================
# Case-insensitive compiled form of DOMAIN_PATTERNS, keyed by the same domain
# tags; built exactly once when the module is imported.
_COMPILED_PATTERNS: dict[str, list[re.Pattern]] = {
    tag: [re.compile(expr, flags=re.IGNORECASE) for expr in expressions]
    for tag, expressions in DOMAIN_PATTERNS.items()
}
logger.debug(
    "Compiled %d domains, %d total patterns",
    len(_COMPILED_PATTERNS),
    sum(map(len, _COMPILED_PATTERNS.values())),
)


def _match_domain(text: str, patterns: list[re.Pattern]) -> bool:
    """Return ``True`` if *text* matches ANY of a domain's pre-compiled regexes.

    The cheap inner predicate of the regex tagging pass: it short-circuits on the
    first pattern that hits, since a single match is enough to associate the text
    with the domain. Patterns come from ``_COMPILED_PATTERNS`` (the case-insensitive
    compilation of ``DOMAIN_PATTERNS``), and *text* is typically an entity's
    ``name`` plus ``description`` or a relationship's ``description``. Pure CPU
    work with no I/O or side effects.

    Called by :func:`_ensure_concepts_and_link` twice per domain in its hot loop:
    once over every fetched entity and once over every relationship description.
    It has no other callers in this module or the wider repo.

    Args:
        text: The candidate string to scan.
        patterns: The compiled regexes for one domain, from ``_COMPILED_PATTERNS``.

    Returns:
        bool: ``True`` on the first matching pattern, else ``False``.
    """
    for pat in patterns:
        if pat.search(text):
            return True
    return False


# -- pgvector scanner (blocking) -------------------------------------------
def _scan_all_domains(store_path: str) -> dict[str, int]:
    """Tally the domain tags found in the metadata of every pgvector chunk.

    The schema name is the last path component of *store_path* (falling back
    to ``_STORE_DIR`` when that is empty) and the collection is ``_COLLECTION``;
    both are passed through ``pg_ident``. Chunks are pulled page by page with
    ``collection.get(offset=..., limit=_BATCH_SIZE)`` until ``collection.count()``
    rows have been consumed or an empty page comes back.
    NOTE(review): correct pagination assumes ``get`` returns rows in a stable
    order -- that ordering is implemented in ``vector_store``, not visible here.

    For each chunk the ``domains`` metadata value is expected to be a
    comma-separated string; anything else is skipped. Tags are stripped and
    lower-cased, and empty tags as well as ``"general"`` are ignored.

    Args:
        store_path: Path whose basename names the pgvector schema.

    Returns:
        dict[str, int]: ``{tag: occurrences}``; an empty dict when the
        ``vector_store`` module cannot be imported.
    """
    try:
        from vector_store import PgVectorCollection, pg_ident
    except ImportError:
        logger.error("vector_store not available")
        return {}

    schema_name = os.path.basename(store_path.rstrip("/")) or _STORE_DIR
    collection = PgVectorCollection(pg_ident(schema_name), pg_ident(_COLLECTION))
    total = collection.count()
    logger.info("pgvector store has %d chunks", total)

    tallies: dict[str, int] = {}
    cursor = 0

    while cursor < total:
        page = collection.get(offset=cursor, limit=_BATCH_SIZE)
        if not page or not page.get("ids"):
            # Store shrank or returned nothing: stop instead of spinning.
            break

        for meta in page.get("metadatas") or []:
            raw = meta.get("domains", "") if meta else ""
            if not isinstance(raw, str) or not raw:
                continue
            for piece in raw.split(","):
                tag = piece.strip().lower()
                if tag in ("", "general"):
                    continue
                tallies[tag] = tallies.get(tag, 0) + 1

        cursor += len(page["ids"])
        # Progress line whenever the running offset has just passed a
        # multiple of 5000.
        if cursor % 5000 < _BATCH_SIZE:
            logger.info("  scanned %d/%d chunks...", cursor, total)

    return tallies


# -- FalkorDB Concept creation + linking ------------------------------------
async def _link_uuids_to_concept(
    graph,
    concept_uuid: str,
    node_uuids: set[str],
    *,
    weight: float,
    source: str,
    now: float,
    tag_name: str,
    failure_label: str,
) -> int:
    """Create missing ``HAS_TAG`` edges from each node uuid to one Concept.

    Issues one query per uuid. A failure for one uuid is logged at debug level
    (prefixed with *failure_label*) and does not stop the remaining links.

    Returns:
        int: Number of uuids for which the query reported a newly linked edge.
    """
    new_edges = 0
    for uid in node_uuids:
        try:
            res = await graph.query(
                "MATCH (c:Concept {uuid: $cuuid}) "
                "MATCH (e {uuid: $euid}) "
                "WHERE NOT (e)-[:HAS_TAG]->(c) "
                "MERGE (e)-[r:HAS_TAG]->(c) "
                "ON CREATE SET r.weight = $weight, "
                "  r.source = $source, "
                "  r.created_at = $now, r.updated_at = $now "
                "RETURN count(r)",
                params={
                    "cuuid": concept_uuid,
                    "euid": uid,
                    "weight": weight,
                    "source": source,
                    "now": now,
                },
            )
            if res.result_set and res.result_set[0][0]:
                new_edges += 1
        except Exception:
            logger.debug(
                "%s failed for %s -> %s", failure_label, uid, tag_name, exc_info=True
            )
    return new_edges


async def _ensure_concepts_and_link(
    redis_url: str,
    domain_counts: dict[str, int],
    dry_run: bool = False,
    ssl_kwargs: dict | None = None,
) -> None:
    """MERGE a ``:Concept`` node per domain tag and link matching nodes to it.

    For every tag in *domain_counts* (processed in sorted order) a ``:Concept``
    node keyed on ``name`` / ``scope_id='_'`` / ``category='general'`` is MERGEd
    into the ``GRAPH_NAME`` graph. On create it is stamped with a fresh uuid,
    description, ``mention_count`` and audit fields; on match ``mention_count``
    is increased by the tag's chunk count (so re-running the script adds the
    counts again). Matching nodes are then attached through ``HAS_TAG`` edges:

    * tag has compiled regexes in ``_COMPILED_PATTERNS``: entities whose
      ``"<name> <description>"`` matches are linked with weight 0.5
      (``domain_regex``); endpoints of relationships whose description matches
      are linked with weight 0.3 (``rel_desc_regex``) unless already matched as
      entities;
    * tag has no regexes: server-side ``CONTAINS`` fallback, weight 0.4 for
      entities (``string_match``) and 0.3 for relationship endpoints
      (``rel_desc_string``), each capped at 200 nodes.

    Connection handling: ``falkordb.asyncio.falkordb.Is_Cluster`` is replaced
    by a stub only while the client objects are constructed and is restored
    afterwards; the Redis client is closed in a ``finally``. Outside dry-run
    the server's ``RESULTSET_SIZE`` is set to ``-1`` (best effort, and not
    restored afterwards) before all entities and relationship descriptions are
    pre-fetched. A failed relationship fetch is logged and degrades to
    entity-only matching.

    Args:
        redis_url: Redis connection URL backing the FalkorDB graph.
        domain_counts: ``{domain_tag: chunk_count}`` as produced by
            :func:`_scan_all_domains`.
        dry_run: When ``True`` only report, per tag, whether a Concept with
            that (lower-cased) name already exists; nothing is written.
        ssl_kwargs: Extra keyword arguments forwarded to
            ``redis.asyncio.from_url`` (e.g. TLS settings).

    Returns:
        None: Results are reported through the module logger.
    """
    # FalkorDB's constructor runs a cluster probe; stub it out for the duration
    # of construction only, then put the real function back.
    import falkordb.asyncio.falkordb as _fdb_mod

    _real_is_cluster = _fdb_mod.Is_Cluster
    _fdb_mod.Is_Cluster = lambda _conn: False
    try:
        rc = aioredis.from_url(redis_url, decode_responses=True, **(ssl_kwargs or {}))
        db = FalkorDB(connection_pool=rc.connection_pool)
        graph = db.select_graph(GRAPH_NAME)
    finally:
        _fdb_mod.Is_Cluster = _real_is_cluster

    try:
        now = time.time()
        created = 0
        existed = 0
        linked = 0

        # In-memory snapshots used for regex matching (empty in dry-run).
        all_entities: list[tuple[str, str, str]] = []  # (uuid, name, desc)
        all_rels: list[tuple[str, str, str]] = []  # (a_uuid, b_uuid, desc)

        if not dry_run:
            # FalkorDB caps result sets at 10,000 rows by default -- lift it.
            try:
                await rc.execute_command(
                    "GRAPH.CONFIG",
                    "SET",
                    "RESULTSET_SIZE",
                    -1,
                )
                logger.info("Set RESULTSET_SIZE = unlimited")
            except Exception:
                logger.warning("Could not set RESULTSET_SIZE, may be capped at 10k")

            logger.info("Fetching all entities for regex matching...")
            ent_result = await graph.query(
                "MATCH (e) WHERE e.uuid IS NOT NULL AND e.name IS NOT NULL "
                "RETURN e.uuid, e.name, COALESCE(e.description, '')",
                timeout=120_000,
            )
            all_entities = [
                (row[0], row[1], row[2]) for row in (ent_result.result_set or [])
            ]
            logger.info("  Loaded %d entities", len(all_entities))

            # Relationship descriptions are a heavy query; failure is non-fatal.
            logger.info("Fetching relationships for regex matching...")
            try:
                rel_result = await graph.query(
                    "MATCH (a)-[r]->(b) "
                    "WHERE r.description IS NOT NULL AND r.description <> '' "
                    "RETURN a.uuid, b.uuid, r.description",
                    timeout=600_000,  # 10 min -- big graph
                )
                all_rels = [
                    (row[0], row[1], row[2]) for row in (rel_result.result_set or [])
                ]
                logger.info("  Loaded %d relationships", len(all_rels))
            except Exception:
                logger.warning(
                    "  Relationship fetch timed out/failed -- "
                    "continuing with entity-only matching",
                    exc_info=True,
                )

        for tag_name, chunk_count in sorted(domain_counts.items()):
            if dry_run:
                check = await graph.query(
                    "MATCH (c:Concept) WHERE toLower(c.name) = $name "
                    "RETURN c.uuid",
                    params={"name": tag_name},
                )
                status = "EXISTS" if check.result_set else "NEW"
                logger.info(
                    "  [DRY RUN] %s  %-30s  (%d chunks)",
                    status,
                    tag_name,
                    chunk_count,
                )
                if check.result_set:
                    existed += 1
                else:
                    created += 1
                continue

            # The candidate uuid is only stored on the ON CREATE branch, so
            # getting it back from the query tells us the node is new.
            candidate_uuid = _make_uuid()
            result = await graph.query(
                "MERGE (c:Concept {name: $name, scope_id: '_', category: 'general'}) "
                "ON CREATE SET "
                "  c.uuid = $uuid, "
                "  c.description = $desc, "
                "  c.priority = 1, "
                "  c.mention_count = $count, "
                "  c.created_at = $now, "
                "  c.updated_at = $now, "
                "  c.created_by = 'system:domain_tagger', "
                "  c.user_id = '__global__', "
                "  c.pinned = false "
                "ON MATCH SET "
                # COALESCE: a pre-existing Concept without mention_count would
                # otherwise turn the sum into NULL.
                "  c.mention_count = COALESCE(c.mention_count, 0) + $count, "
                "  c.updated_at = $now "
                "RETURN c.uuid",
                params={
                    "name": tag_name,
                    "uuid": candidate_uuid,
                    "desc": f"Domain tag from Loopmother memory corpus ({chunk_count} chunks)",
                    "count": chunk_count,
                    "now": now,
                },
            )

            if not result.result_set:
                continue

            concept_uuid = result.result_set[0][0]
            if concept_uuid == candidate_uuid:
                created += 1
            else:
                existed += 1

            if concept_uuid is None:
                # Every linking query matches the Concept by uuid, so none of
                # them could find this node.
                logger.warning(
                    "  Concept %s has no uuid -- skipping HAS_TAG linking", tag_name
                )
                continue

            entity_links = 0
            rel_links = 0
            patterns = _COMPILED_PATTERNS.get(tag_name, [])

            if patterns:
                # Regex pass over entity name + description.
                matched_uuids: set[str] = {
                    e_uuid
                    for e_uuid, e_name, e_desc in all_entities
                    if e_uuid != concept_uuid
                    and _match_domain(f"{e_name} {e_desc}", patterns)
                }

                # Regex pass over relationship descriptions; both endpoints
                # inherit the tag. Endpoints without a uuid cannot be linked.
                rel_matched_uuids: set[str] = set()
                for a_uuid, b_uuid, r_desc in all_rels:
                    if not _match_domain(r_desc, patterns):
                        continue
                    for endpoint in (a_uuid, b_uuid):
                        if endpoint is not None and endpoint != concept_uuid:
                            rel_matched_uuids.add(endpoint)

                if matched_uuids or rel_matched_uuids:
                    logger.info(
                        "  %-25s  regex: %d entity, %d rel matches",
                        tag_name,
                        len(matched_uuids),
                        len(rel_matched_uuids),
                    )

                entity_links = await _link_uuids_to_concept(
                    graph,
                    concept_uuid,
                    matched_uuids,
                    weight=0.5,
                    source="domain_regex",
                    now=now,
                    tag_name=tag_name,
                    failure_label="Link",
                )
                # Relationship-inferred matches, minus those linked just above.
                rel_links = await _link_uuids_to_concept(
                    graph,
                    concept_uuid,
                    rel_matched_uuids - matched_uuids,
                    weight=0.3,
                    source="rel_desc_regex",
                    now=now,
                    tag_name=tag_name,
                    failure_label="Rel link",
                )
            else:
                # No compiled patterns for this domain -- fall back to a
                # server-side substring match on the tag itself.
                lr = await graph.query(
                    "MATCH (c:Concept {uuid: $cuuid}) "
                    "MATCH (e) WHERE e.uuid <> c.uuid "
                    "AND (toLower(e.name) CONTAINS $tag "
                    "     OR toLower(e.description) CONTAINS $tag) "
                    "AND NOT (e)-[:HAS_TAG]->(c) "
                    "WITH e, c LIMIT 200 "
                    "MERGE (e)-[r:HAS_TAG]->(c) "
                    "ON CREATE SET r.weight = 0.4, "
                    "  r.source = 'string_match', "
                    "  r.created_at = $now, r.updated_at = $now "
                    "RETURN count(r)",
                    params={"cuuid": concept_uuid, "tag": tag_name, "now": now},
                )
                if lr.result_set:
                    entity_links = lr.result_set[0][0]

                rlr = await graph.query(
                    "MATCH (c:Concept {uuid: $cuuid}) "
                    "MATCH (a)-[r]-(b) "
                    "WHERE toLower(r.description) CONTAINS $tag "
                    "AND NOT (a)-[:HAS_TAG]->(c) "
                    "AND a.uuid <> c.uuid "
                    "WITH DISTINCT a, c LIMIT 200 "
                    "MERGE (a)-[t:HAS_TAG]->(c) "
                    "ON CREATE SET t.weight = 0.3, "
                    "  t.source = 'rel_desc_string', "
                    "  t.created_at = $now, t.updated_at = $now "
                    "RETURN count(t)",
                    params={"cuuid": concept_uuid, "tag": tag_name, "now": now},
                )
                if rlr.result_set:
                    rel_links = rlr.result_set[0][0]

            total_links = entity_links + rel_links
            linked += total_links
            if total_links:
                logger.info(
                    "  %-30s  %d linked (entity=%d, rel=%d)",
                    tag_name,
                    total_links,
                    entity_links,
                    rel_links,
                )

        logger.info(
            "Done. concepts=%d (new=%d, existed=%d), edges=%d",
            created + existed,
            created,
            existed,
            linked,
        )
    finally:
        await rc.aclose()

def _make_uuid() -> str:
    """Return a newly generated UUIDv7 as a string.

    The third-party ``uuid6`` package is imported on first use rather than at
    module import, so merely importing this module does not require it.

    Used by :func:`_ensure_concepts_and_link` as the ``$uuid`` parameter of the
    Concept ``MERGE`` query, where it is stored only on the ``ON CREATE``
    branch.

    Returns:
        str: ``str()`` of the value returned by ``uuid6.uuid7()``.
    """
    import uuid6

    fresh = uuid6.uuid7()
    return str(fresh)


# -- CLI -------------------------------------------------------------------
def main() -> None:
    """CLI entry point: scan pgvector domain tags and wire them into the KG.

    Steps:

    1. Try to load ``config.Config``; any failure leaves the config unset and
       the script continues with CLI/env/default settings.
    2. Parse ``--redis-url`` and ``--dry-run``.
    3. Resolve the Redis URL with precedence CLI > ``cfg.redis_url`` >
       ``REDIS_URL`` environment variable > ``redis://localhost:6379/0``, and
       obtain connection kwargs from ``cfg.redis_connection_kwargs_for_url``
       when a config is available.
    4. Call :func:`_scan_all_domains` with ``<script dir>/rag_stores/<_STORE_DIR>``
       and return early when it yields no tags.
    5. Run :func:`_ensure_concepts_and_link` under ``asyncio.run``; with
       ``--dry-run`` that call only reports and writes nothing.

    The Redis URL is logged with any embedded password masked.

    Returns:
        None: All output goes through the module logger.
    """
    try:
        from config import Config

        cfg = Config.load()
    except Exception:
        # Config is optional for this script; keep going, but leave a trace
        # for anyone debugging why config.yaml settings were not picked up.
        logger.debug("Could not load config.Config; using CLI/env defaults", exc_info=True)
        cfg = None

    parser = argparse.ArgumentParser(
        description="Extract domain tags from pgvector and wire to KG Concepts",
    )
    parser.add_argument(
        "--redis-url",
        default=None,
        help="Redis URL (default: from config.yaml)",
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Show what would be created without writing",
    )
    args = parser.parse_args()

    # Resolve Redis URL: CLI > Config > env > fallback
    redis_url = (
        args.redis_url
        or (cfg.redis_url if cfg else None)
        or os.environ.get("REDIS_URL")
        or "redis://localhost:6379/0"
    )
    _ssl = cfg.redis_connection_kwargs_for_url(redis_url) if cfg else {}

    # Step 1: scan the pgvector store for domain tags
    project_root = os.path.dirname(os.path.abspath(__file__))
    store_path = os.path.join(project_root, "rag_stores", _STORE_DIR)

    logger.info("Scanning pgvector store for domain tags...")
    domain_counts = _scan_all_domains(store_path)
    logger.info(
        "Found %d unique domain tags (%d compiled pattern sets available)",
        len(domain_counts),
        len(_COMPILED_PATTERNS),
    )

    if not domain_counts:
        logger.info("No domain tags found. Nothing to do.")
        return

    # Mask "user:password@" credentials before the URL reaches the log.
    display_url = redis_url
    scheme, sep, remainder = redis_url.partition("://")
    authority, slash, tail = remainder.partition("/")
    if sep and "@" in authority:
        credentials, _, host_part = authority.rpartition("@")
        user = credentials.partition(":")[0]
        display_url = f"{scheme}://{user}:***@{host_part}{slash}{tail}"

    # Step 2: create Concepts + link entities via regex patterns
    logger.info("Connecting to %s (ssl=%s)", display_url[:40], bool(_ssl))
    asyncio.run(
        _ensure_concepts_and_link(
            redis_url,
            domain_counts,
            dry_run=args.dry_run,
            ssl_kwargs=_ssl,
        )
    )


# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__": main()