# Source code for classifiers.skill_catalog

"""SQLite-backed index for Agent Skills (SKILL.md discovery and metadata)."""

from __future__ import annotations

import hashlib
import logging
import re
import sqlite3
import time
from pathlib import Path
from typing import Any

import yaml

# Module-level logger; file-read and YAML problems in this module are reported
# through it.
logger = logging.getLogger(__name__)

# Directory names that skill discovery never descends into (VCS metadata,
# dependency trees, virtualenvs, build output, tool caches).
# Skip paths that cannot contain skills (aligned with agentskills client guide).
_SKIP_DIR_NAMES = frozenset(
    {
        ".git",
        "node_modules",
        "__pycache__",
        ".venv",
        "venv",
        "dist",
        "build",
        ".mypy_cache",
        ".pytest_cache",
    }
)

# Default ``max_depth`` for ``discover_skill_dirs``; the scan root is depth 0.
_MAX_SCAN_DEPTH = 8

# Hidden dirs are skipped except `.agents` — `npx skills add` installs under `.agents/skills/`.
_ALLOW_DOT_DIR_NAMES = frozenset({".agents"})


def _parse_skill_md(path: Path) -> dict[str, Any] | None:
    """Extract name, description, body and body hash from one SKILL.md.

    The file must open with a ``---`` fenced YAML frontmatter block followed
    by the markdown body. The frontmatter has to be a YAML mapping carrying
    non-empty ``name`` and ``description`` values; both are converted to
    ``str`` and whitespace-stripped. The body (everything after the closing
    fence line) is returned verbatim together with its SHA-256 hex digest,
    which the original documentation describes as the dedupe key used by
    ingestion.

    Side effects are limited to reading *path* and logging: a read failure is
    logged at WARNING, a YAML parse failure at DEBUG. Every other rejection
    is silent.

    Args:
        path: Filesystem path to a candidate ``SKILL.md`` file.

    Returns:
        A dict with ``name``, ``description``, ``body`` and ``body_hash``
        keys, or ``None`` when the file cannot be read, has no well-formed
        frontmatter fence, contains invalid or non-mapping YAML, or lacks a
        usable name or description.
    """
    try:
        text = path.read_text(encoding="utf-8", errors="replace")
    except OSError as exc:
        logger.warning("Cannot read %s: %s", path, exc)
        return None

    # Cheap prefix test first; only fenced files reach the regex.
    fenced = (
        re.match(r"^---\s*\n(.*?)\n---\s*\n(.*)$", text, re.DOTALL)
        if text.startswith("---")
        else None
    )
    if fenced is None:
        return None
    frontmatter_text, body = fenced.groups()

    try:
        frontmatter = yaml.safe_load(frontmatter_text) or {}
    except yaml.YAMLError as exc:
        logger.debug("YAML frontmatter failed %s: %s", path, exc)
        return None
    if not isinstance(frontmatter, dict):
        return None

    # Reject missing/falsy values before normalising, then reject values that
    # are blank once stripped.
    raw_fields = [frontmatter.get(key) for key in ("name", "description")]
    if not all(raw_fields):
        return None
    name, description = [str(value).strip() for value in raw_fields]
    if not (name and description):
        return None

    return {
        "name": name,
        "description": description,
        "body": body,
        "body_hash": hashlib.sha256(body.encode("utf-8")).hexdigest(),
    }



# [docs]  (documentation-link marker from the rendered source page)
def stable_skill_id(skill_root: Path, corpus_root: Path) -> str:
    """Derive a deterministic skill ID from its path relative to the corpus.

    The identifier depends only on where the skill lives, never on its
    content, so the same skill keeps the same primary key across re-ingests.
    Both paths are resolved, the posix form of *skill_root* relative to
    *corpus_root* is hashed with SHA-256, and the digest is truncated to 32
    hex characters.

    When *skill_root* is not located under *corpus_root*, the bare directory
    name (``skill_root.name``) is hashed instead. Note that in this fallback
    two same-named directories outside the corpus produce the same ID.

    Performs no file reads or writes (``Path.resolve`` may consult the
    filesystem to follow symlinks).

    Args:
        skill_root: Directory containing the skill (the dir that holds its
            ``SKILL.md``).
        corpus_root: Root of the skills corpus that *skill_root* lives under.

    Returns:
        A 32-character lowercase hex string usable as the skill's primary key.
    """
    try:
        rel = skill_root.resolve().relative_to(corpus_root.resolve())
    except ValueError:
        # Outside the corpus: ``name`` is already a plain string, so it must
        # not go through ``as_posix()`` (that call only exists on paths).
        basis = skill_root.name
    else:
        basis = rel.as_posix()
    return hashlib.sha256(basis.encode("utf-8")).hexdigest()[:32]




# [docs]  (documentation-link marker from the rendered source page)
def canonical_skill_sort_key(skill_dir: Path, corpus_root: Path) -> tuple[int, str]:
    """Build a sort key that puts canonical skill sources first.

    The first path component of *skill_dir* relative to *corpus_root* picks a
    precedence bucket: ``repos`` is 0, ``npx`` is 1, anything else under the
    corpus (including the corpus root itself) is 2. A directory that is not
    under *corpus_root* gets bucket 99 and its own, unresolved, posix path.
    Sorting duplicates by this key and keeping the first therefore prefers
    the ``repos`` copy over an ``npx`` copy.

    Performs no file reads or writes (``Path.resolve`` may consult the
    filesystem to follow symlinks).

    Args:
        skill_dir: Directory of the skill being ranked.
        corpus_root: Root of the corpus the relative path is computed against.

    Returns:
        A ``(bucket, posix_path)`` tuple; smaller buckets, then
        lexicographically smaller paths, sort first.
    """
    try:
        relative = skill_dir.resolve().relative_to(corpus_root.resolve())
    except ValueError:
        return (99, skill_dir.as_posix())

    # Only the top-level directory name decides the bucket.
    components = relative.parts
    top_level = components[0] if components else None
    bucket_by_top_level = {"repos": 0, "npx": 1}
    return (bucket_by_top_level.get(top_level, 2), relative.as_posix())




# [docs]  (documentation-link marker from the rendered source page)
def discover_skill_dirs(
    root: Path,
    *,
    max_depth: int = _MAX_SCAN_DEPTH,
) -> list[Path]:
    """Collect every directory under *root* that directly holds a SKILL.md.

    *root* is resolved and scanned depth-first in pre-order. A directory
    containing a ``SKILL.md`` file is recorded and not searched any further
    (skills are not nested). Otherwise its sub-directories are visited in the
    order ``iterdir`` reported them, except those named in
    ``_SKIP_DIR_NAMES`` and hidden dot-directories that are not listed in
    ``_ALLOW_DOT_DIR_NAMES``.

    Only directory listings and file-type checks touch the filesystem;
    directories that cannot be listed are skipped silently. If *root* is not
    a directory the result is empty.

    Args:
        root: Directory to scan recursively for skills.
        max_depth: Deepest level visited, with *root* itself at depth 0
            (default ``_MAX_SCAN_DEPTH``).

    Returns:
        The directories that directly contain a ``SKILL.md`` file, in
        discovery order.
    """
    skill_dirs: list[Path] = []
    start = root.resolve()
    if not start.is_dir():
        return skill_dirs

    # Explicit stack of (directory, depth). Children are pushed in reverse so
    # they pop in listing order, giving the same pre-order as recursion.
    pending: list[tuple[Path, int]] = [(start, 0)]
    while pending:
        current, depth = pending.pop()
        if depth > max_depth:
            continue
        try:
            children = list(current.iterdir())
        except OSError:
            # Unreadable directory: ignore it and keep scanning.
            continue
        if (current / "SKILL.md").is_file():
            skill_dirs.append(current)
            continue
        descend_into = [
            child
            for child in children
            if child.is_dir()
            and child.name not in _SKIP_DIR_NAMES
            and (not child.name.startswith(".") or child.name in _ALLOW_DOT_DIR_NAMES)
        ]
        pending.extend((child, depth + 1) for child in reversed(descend_into))
    return skill_dirs




# [docs]  (documentation-link marker from the rendered source page)
def init_db(db_path: Path) -> None:
    """Ensure the catalog database and its ``skills`` table exist.

    Creates the parent directory of *db_path* when needed, opens the SQLite
    file (which creates it if absent) and issues ``CREATE TABLE IF NOT
    EXISTS skills``. Because both steps tolerate existing state, the call can
    be repeated safely. The connection is always closed before returning.

    Args:
        db_path: Path to the SQLite database file to initialise.
    """
    create_table_sql = """
            CREATE TABLE IF NOT EXISTS skills (
                skill_id TEXT PRIMARY KEY,
                name TEXT NOT NULL,
                description TEXT NOT NULL,
                skill_md_path TEXT NOT NULL,
                skill_root TEXT NOT NULL,
                corpus_root TEXT NOT NULL,
                body_hash TEXT NOT NULL,
                ingested_at REAL NOT NULL
            )
            """
    db_path.parent.mkdir(parents=True, exist_ok=True)
    connection = sqlite3.connect(str(db_path))
    try:
        # The connection context manager commits on success; it does not
        # close the connection, hence the surrounding try/finally.
        with connection:
            connection.execute(create_table_sql)
    finally:
        connection.close()




# [docs]  (documentation-link marker from the rendered source page)
def upsert_skill(
    db_path: Path,
    row: dict[str, Any],
) -> None:
    """Write one skill row, replacing any existing row with the same ID.

    Uses ``INSERT OR REPLACE`` keyed on ``skill_id``, so re-ingesting a skill
    overwrites its previous row instead of adding a duplicate. When *row* has
    no ``ingested_at`` key the current ``time.time()`` is stored.

    Opens a connection, writes, commits and closes. The ``skills`` table must
    already exist (see :func:`init_db`).

    Args:
        db_path: Path to the SQLite catalog database.
        row: Mapping with ``skill_id``, ``name``, ``description``,
            ``skill_md_path``, ``skill_root``, ``corpus_root`` and
            ``body_hash`` keys, plus an optional ``ingested_at`` epoch float.

    Raises:
        KeyError: If *row* lacks one of the required keys.
    """
    required_columns = (
        "skill_id",
        "name",
        "description",
        "skill_md_path",
        "skill_root",
        "corpus_root",
        "body_hash",
    )
    statement = """
            INSERT OR REPLACE INTO skills (
                skill_id, name, description, skill_md_path, skill_root,
                corpus_root, body_hash, ingested_at
            ) VALUES (?, ?, ?, ?, ?, ?, ?, ?)
            """
    connection = sqlite3.connect(str(db_path))
    try:
        # Parameters follow the column order of the statement; the timestamp
        # comes last and falls back to "now".
        values = tuple(row[column] for column in required_columns)
        parameters = (*values, row.get("ingested_at", time.time()))
        with connection:
            connection.execute(statement, parameters)
    finally:
        connection.close()




# [docs]  (documentation-link marker from the rendered source page)
def load_skill_by_id(db_path: Path, skill_id: str) -> dict[str, Any] | None:
    """Load a single skill's catalog row by its ID.

    Point lookup against the ``skills`` table used to resolve a skill's
    metadata (including its ``skill_md_path`` on disk) from the stable ID. A
    missing database file or a missing row both yield ``None`` rather than
    raising, so callers can treat "unknown skill" uniformly.

    Touches the SQLite database (opens read-only via a ``SELECT``, then closes).
    Called by the ``activate_skill`` tool to fetch the row before reading the
    skill body, and exercised by ``tests/test_skill_catalog.py``.

    Args:
        db_path: Path to the SQLite catalog database.
        skill_id: Stable skill identifier (see :func:`stable_skill_id`).

    Returns:
        A dict of the skill's columns (without ``ingested_at``), or ``None`` if
        the database file or the row does not exist.
    """
    if not db_path.is_file():
        return None
    conn = sqlite3.connect(str(db_path))
    try:
        cur = conn.execute(
            "SELECT skill_id, name, description, skill_md_path, skill_root, "
            "corpus_root, body_hash FROM skills WHERE skill_id = ?",
            (skill_id,),
        )
        r = cur.fetchone()
        if not r:
            return None
        return {
            "skill_id": r[0],
            "name": r[1],
            "description": r[2],
            "skill_md_path": r[3],
            "skill_root": r[4],
            "corpus_root": r[5],
            "body_hash": r[6],
        }
    finally:
        conn.close()




# [docs]  (documentation-link marker from the rendered source page)
def load_all_skills(db_path: Path) -> list[dict[str, Any]]:
    """Load the metadata of every skill stored in the catalog.

    Reads the whole ``skills`` table (no filtering, no explicit ordering) and
    converts each row to a dict. A missing database file is treated as an
    empty catalog instead of an error.

    Opens a connection to the SQLite file and always closes it again.

    Args:
        db_path: Path to the SQLite catalog database.

    Returns:
        One dict per skill with ``skill_id``, ``name``, ``description``,
        ``skill_md_path``, ``skill_root``, ``corpus_root`` and ``body_hash``
        (``ingested_at`` is not included); an empty list when the database
        file does not exist.
    """
    if not db_path.is_file():
        return []

    # Same order as the SELECT list below.
    columns = (
        "skill_id",
        "name",
        "description",
        "skill_md_path",
        "skill_root",
        "corpus_root",
        "body_hash",
    )
    connection = sqlite3.connect(str(db_path))
    try:
        records = connection.execute(
            "SELECT skill_id, name, description, skill_md_path, skill_root, "
            "corpus_root, body_hash FROM skills",
        ).fetchall()
    finally:
        connection.close()
    return [dict(zip(columns, record)) for record in records]




# [docs]  (documentation-link marker from the rendered source page)
def read_skill_body(skill_md_path: Path) -> tuple[str, str]:
    """Return a SKILL.md's markdown body alongside its full text.

    The file is read as UTF-8 (undecodable bytes are replaced). If it starts
    with a ``---`` fenced frontmatter block, the body is whatever follows the
    closing fence line; otherwise the entire file counts as the body. The
    body is whitespace-stripped, the full text is returned as read.

    Reads *skill_md_path* from disk; read errors propagate to the caller.

    Args:
        skill_md_path: Path to the ``SKILL.md`` file to read.

    Returns:
        A ``(body, raw)`` tuple: the stripped markdown body without
        frontmatter, and the complete file contents.
    """
    raw = skill_md_path.read_text(encoding="utf-8", errors="replace")
    fenced = re.match(r"^---\s*\n.*?\n---\s*\n(.*)$", raw, re.DOTALL)
    # No recognisable frontmatter means the whole document is the body.
    body = fenced.group(1) if fenced else raw
    return body.strip(), raw




# [docs]  (documentation-link marker from the rendered source page)
def skill_embedding_text(name: str, description: str) -> str:
    """Compose the string that represents a skill for embedding.

    The name and the description are joined with a single newline, name
    first. Centralising the format here keeps every caller embedding skills
    the same way.

    Pure string formatting; no I/O.

    Args:
        name: The skill's name.
        description: The skill's description.

    Returns:
        The ``"name\\ndescription"`` string fed to the embedding model.
    """
    return "{}\n{}".format(name, description)