#!/usr/bin/env python3
"""Scan corpus roots for SKILL.md files and populate the SQLite skills index.
CLI and helper for the skills-corpus ingest stage: it walks one or more corpus
roots, discovers every ``SKILL.md`` under them, and upserts the parsed metadata
into the on-disk SQLite skills index that the runtime skill catalog reads from.
Optional body-hash dedupe keeps copies of the same skill within a corpus root
from being indexed more than once.
Heavy lifting lives in :mod:`classifiers.skill_catalog` (discovery, parsing,
stable id generation, and the database upserts); this module wraps it with the
:func:`ingest_roots` driver and a :func:`main` argument-parsing entry point. The
module is runnable as ``python -m classifiers.ingest_skills`` and is also invoked
as a subprocess by the skills-corpus reconcile and pipeline scripts.
"""
from __future__ import annotations
import argparse
import logging
import os
import sys
import time
from pathlib import Path
# Put the parent of this file's directory at the front of sys.path before the
# ``classifiers`` import below (which is why that import carries
# ``noqa: E402``). Presumably this lets the package resolve when the file is
# executed directly as a script -- confirm against how the project is launched.
sys.path.insert(
    0,
    os.path.abspath(os.path.join(os.path.dirname(__file__), "..")),
)
from classifiers.skill_catalog import ( # noqa: E402
canonical_skill_sort_key,
discover_skill_dirs,
init_db,
load_all_skills,
stable_skill_id,
upsert_skill,
_parse_skill_md,
)
# Configure root logging (INFO level, timestamped "time - level - message"
# records) and create this module's logger.
# NOTE(review): this runs at import time, so it also executes for code that
# merely imports ``ingest_roots``; ``logging.basicConfig`` does nothing when
# the root logger already has handlers.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger(__name__)
def ingest_roots(
    corpus_roots: list[Path],
    db_path: Path,
    *,
    dedupe_by_body_hash: bool = True,
) -> tuple[int, int]:
    """Index every ``SKILL.md`` found under ``corpus_roots`` into SQLite.

    The schema is created first with
    :func:`classifiers.skill_catalog.init_db`. Each root is then resolved and
    processed on its own: skill directories come from
    :func:`classifiers.skill_catalog.discover_skill_dirs`, are ordered with
    :func:`classifiers.skill_catalog.canonical_skill_sort_key`, and each
    directory's ``SKILL.md`` is parsed with
    ``classifiers.skill_catalog._parse_skill_md``. Every surviving skill gets
    an id from :func:`classifiers.skill_catalog.stable_skill_id` and is
    written with :func:`classifiers.skill_catalog.upsert_skill`.

    A root that does not resolve to a directory is logged at WARNING level
    and ignored; it changes neither returned count. A ``SKILL.md`` whose
    parse result is falsy, or whose body hash was already seen in the same
    root while deduplication is enabled, adds one to the skipped count.

    Args:
        corpus_roots: Directories to scan, each possibly containing nested
            skills.
        db_path: Path of the SQLite skills index to initialise and write.
        dedupe_by_body_hash: When ``True`` (default), the first skill seen
            for a given body hash within a root is kept and later skills
            with the same hash in that root are skipped.

    Returns:
        tuple[int, int]: ``(upserted, skipped)`` totals across all roots.
    """
    init_db(db_path)

    upserted = 0
    skipped = 0

    for raw_root in corpus_roots:
        root = raw_root.resolve()
        if not root.is_dir():
            # Logged only; a missing root does not count towards ``skipped``.
            logger.warning("Corpus root missing: %s", root)
            continue

        # Sort so that "first seen wins" below follows the canonical order.
        candidates = sorted(
            discover_skill_dirs(root),
            key=lambda d: canonical_skill_sort_key(d, root),
        )
        logger.info(
            "Found %d skill directories under %s",
            len(candidates),
            root,
        )

        # NOTE(review): the set is recreated for every root, so identical
        # bodies living under different roots are all indexed -- confirm
        # that per-root scope is intended.
        hashes_in_root: set[str] = set()

        for candidate in candidates:
            md_file = candidate / "SKILL.md"
            meta = _parse_skill_md(md_file)
            if not meta:
                skipped += 1
                continue

            digest = meta["body_hash"]
            if dedupe_by_body_hash:
                if digest in hashes_in_root:
                    logger.info(
                        "Dedupe: skipping duplicate body_hash under %s",
                        candidate,
                    )
                    skipped += 1
                    continue
                hashes_in_root.add(digest)

            upsert_skill(
                db_path,
                {
                    "skill_id": stable_skill_id(candidate, root),
                    "name": meta["name"],
                    "description": meta["description"],
                    "skill_md_path": str(md_file.resolve()),
                    "skill_root": str(candidate.resolve()),
                    "corpus_root": str(root),
                    "body_hash": digest,
                    "ingested_at": time.time(),
                },
            )
            upserted += 1

    return upserted, skipped
def main(argv: list[str] | None = None) -> int:
    """Run the SKILL.md ingestion as a standalone CLI entry point.

    Parses the command-line options ``--roots``, ``--db`` and
    ``--no-dedupe-body-hash``, resolves the corpus roots to scan, delegates
    the scan and database writes to :func:`ingest_roots`, and logs a summary
    of how many rows were upserted, how many files were skipped, and how many
    rows ``classifiers.skill_catalog.load_all_skills`` now returns.

    When no ``--roots`` are supplied it falls back to the configured roots by
    importing ``config.Config`` and reading
    ``Config.load().skills_corpus_roots``. If that is empty (or ``None``) an
    error is logged and the function returns ``1``.

    Invoked by this module's ``__main__`` guard, so it is the process entry
    point for ``python -m classifiers.ingest_skills``.

    Args:
        argv: Argument strings to parse instead of the process command line.
            ``None`` (the default) makes ``argparse`` read ``sys.argv[1:]``,
            which is the behaviour existing callers rely on.

    Returns:
        int: ``0`` on successful ingestion, or ``1`` when no corpus roots
        could be resolved from arguments or config.

    Raises:
        SystemExit: Raised by ``argparse`` for ``--help`` or invalid options.
    """
    parser = argparse.ArgumentParser(
        description="Ingest Agent Skills from corpus directories into SQLite.",
    )
    parser.add_argument(
        "--roots",
        nargs="*",
        default=[],
        help="Directories to scan (each may contain nested skills).",
    )
    parser.add_argument(
        "--db",
        default="data/skills_index.db",
        help="SQLite database path",
    )
    parser.add_argument(
        "--no-dedupe-body-hash",
        action="store_true",
        help="Store every SKILL.md even when body matches another skill (default: dedupe)",
    )
    args = parser.parse_args(argv)

    roots = [Path(p) for p in args.roots]
    if not roots:
        # Imported lazily so runs with explicit --roots never need the
        # project configuration module.
        from config import Config

        configured = Config.load().skills_corpus_roots
        # Config is project code not visible here; treat a None setting like
        # an empty one so the error below is reported instead of a TypeError.
        roots = [Path(p) for p in configured or ()]
    if not roots:
        logger.error(
            "No corpus roots: pass --roots or set skills.corpus_roots in config"
        )
        return 1

    db_path = Path(args.db)
    n_ok, n_skip = ingest_roots(
        roots,
        db_path,
        dedupe_by_body_hash=not args.no_dedupe_body_hash,
    )
    total = len(load_all_skills(db_path))
    logger.info(
        "Ingest complete: upserted=%d skipped=%d total_in_db=%d",
        n_ok,
        n_skip,
        total,
    )
    return 0
# Script entry point: hand main()'s return value to the interpreter as the
# process exit status.
if __name__ == "__main__":
    sys.exit(main())