# Source code for classifiers.ingest_skills

#!/usr/bin/env python3
"""Scan corpus roots for SKILL.md files and populate the SQLite skills index.

CLI and helper for the skills-corpus ingest stage: it walks one or more corpus
roots, discovers every ``SKILL.md`` under them, and upserts the parsed metadata
into the on-disk SQLite skills index that the runtime skill catalog reads from.
Optional body-hash dedupe keeps copies of the same skill from being indexed more
than once.

Heavy lifting lives in :mod:`classifiers.skill_catalog` (discovery, parsing,
stable id generation, and the database upserts); this module wraps it with the
:func:`ingest_roots` driver and a :func:`main` argument-parsing entry point. The
module is runnable as ``python -m classifiers.ingest_skills`` and is also invoked
as a subprocess by the skills-corpus reconcile and pipeline scripts.
"""

from __future__ import annotations

import argparse
import logging
import os
import sys
import time
from pathlib import Path

# Put the parent of this file's directory at the front of sys.path before the
# ``classifiers.*`` import below (hence its ``noqa: E402``) -- presumably so the
# package resolves when this file is executed directly as a script.
sys.path.insert(
    0,
    os.path.abspath(os.path.join(os.path.dirname(__file__), "..")),
)

from classifiers.skill_catalog import (  # noqa: E402
    canonical_skill_sort_key,
    discover_skill_dirs,
    init_db,
    load_all_skills,
    stable_skill_id,
    upsert_skill,
    _parse_skill_md,
)

# Configure the root logger at import time: INFO level with a
# "timestamp - level - message" format. ``basicConfig`` does nothing if the
# root logger already has handlers, so an embedding application's own logging
# configuration takes precedence when it was set up first.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)
# Module-level logger used by ingest_roots() and main().
logger = logging.getLogger(__name__)


def ingest_roots(
    corpus_roots: list[Path],
    db_path: Path,
    *,
    dedupe_by_body_hash: bool = True,
) -> tuple[int, int]:
    """Index every ``SKILL.md`` found under ``corpus_roots`` into SQLite.

    The schema is initialised once with ``init_db``. Each root is then
    resolved and, when it is a directory, scanned with
    ``discover_skill_dirs``. Skill directories are visited in
    ``canonical_skill_sort_key`` order, so with deduplication enabled the
    first directory in that order keeps a given body hash. Every surviving
    skill gets an id from ``stable_skill_id`` and is written with
    ``upsert_skill``.

    A root that is not a directory is logged at WARNING level and ignored; it
    is not added to either returned count. The set of seen body hashes is
    rebuilt for each root, so identical bodies that live under different
    roots are not deduplicated against each other.

    Args:
        corpus_roots: Directories to scan; each is resolved before use.
        db_path: SQLite index file handed to ``init_db`` and ``upsert_skill``.
        dedupe_by_body_hash: When ``True`` (default), a skill whose
            ``body_hash`` was already seen under the same root is skipped.

    Returns:
        ``(upserted, skipped)`` summed over all roots. ``skipped`` counts
        files for which ``_parse_skill_md`` returned a falsy result plus
        body-hash duplicates.
    """
    init_db(db_path)

    upserted = 0
    skipped = 0

    for raw_root in corpus_roots:
        root = raw_root.resolve()

        # Guard clause: nothing to scan if the root is absent or not a dir.
        if not root.is_dir():
            logger.warning("Corpus root missing: %s", root)
            continue

        ordered_dirs = sorted(
            discover_skill_dirs(root),
            key=lambda d, base=root: canonical_skill_sort_key(d, base),
        )
        logger.info(
            "Found %d skill directories under %s",
            len(ordered_dirs),
            root,
        )

        # Dedupe state is scoped to the current root only.
        hashes_in_root: set[str] = set()

        for skill_dir in ordered_dirs:
            skill_md = skill_dir / "SKILL.md"
            meta = _parse_skill_md(skill_md)
            if not meta:
                skipped += 1
                continue

            body_hash = meta["body_hash"]
            if dedupe_by_body_hash:
                if body_hash in hashes_in_root:
                    logger.info(
                        "Dedupe: skipping duplicate body_hash under %s",
                        skill_dir,
                    )
                    skipped += 1
                    continue
                hashes_in_root.add(body_hash)

            upsert_skill(
                db_path,
                {
                    "skill_id": stable_skill_id(skill_dir, root),
                    "name": meta["name"],
                    "description": meta["description"],
                    "skill_md_path": str(skill_md.resolve()),
                    "skill_root": str(skill_dir.resolve()),
                    "corpus_root": str(root),
                    "body_hash": body_hash,
                    "ingested_at": time.time(),
                },
            )
            upserted += 1

    return upserted, skipped
def main(argv: list[str] | None = None) -> int:
    """Parse CLI arguments and ingest the selected corpus roots into SQLite.

    Recognised options are ``--roots`` (zero or more directories), ``--db``
    (SQLite path, default ``data/skills_index.db``) and
    ``--no-dedupe-body-hash``. When no roots are given on the command line,
    the roots are read from ``Config.load().skills_corpus_roots``; if that is
    also empty or unset, an error is logged and ``1`` is returned. Otherwise
    the work is delegated to :func:`ingest_roots`, and a summary line with the
    upserted, skipped and total row counts (the total coming from
    ``load_all_skills``) is logged.

    Args:
        argv: Argument strings to parse. ``None`` (the default) makes
            ``argparse`` read ``sys.argv[1:]``, which is the behaviour of the
            ``__main__`` guard; passing a list allows calling ``main``
            programmatically.

    Returns:
        ``0`` after a completed ingest, ``1`` when no corpus roots could be
        resolved from the arguments or the config.

    Raises:
        SystemExit: Raised by ``argparse`` for ``--help`` or invalid options.
    """
    parser = argparse.ArgumentParser(
        description="Ingest Agent Skills from corpus directories into SQLite.",
    )
    parser.add_argument(
        "--roots",
        nargs="*",
        default=[],
        help="Directories to scan (each may contain nested skills).",
    )
    parser.add_argument(
        "--db",
        default="data/skills_index.db",
        help="SQLite database path",
    )
    parser.add_argument(
        "--no-dedupe-body-hash",
        action="store_true",
        help="Store every SKILL.md even when body matches another skill (default: dedupe)",
    )
    args = parser.parse_args(argv)

    roots = [Path(p) for p in args.roots]
    if not roots:
        # Imported only on this fallback path, so the config module is not
        # needed when roots are passed explicitly.
        from config import Config

        cfg = Config.load()
        # ``or ()`` lets an unset (None) setting reach the error below instead
        # of raising TypeError while iterating.
        roots = [Path(p) for p in (cfg.skills_corpus_roots or ())]
    if not roots:
        logger.error(
            "No corpus roots: pass --roots or set skills.corpus_roots in config"
        )
        return 1

    db_path = Path(args.db)
    n_ok, n_skip = ingest_roots(
        roots,
        db_path,
        dedupe_by_body_hash=not args.no_dedupe_body_hash,
    )
    total = len(load_all_skills(db_path))
    logger.info(
        "Ingest complete: upserted=%d skipped=%d total_in_db=%d",
        n_ok,
        n_skip,
        total,
    )
    return 0
# Script entry point: exit the process with the status code returned by main().
if __name__ == "__main__": raise SystemExit(main())