# Source code for jsonutil

"""JSON helpers backed by orjson; falls back to stdlib ``json`` when required.

Drop-in replacement for the stdlib :mod:`json` module used across every
Stargazer service -- callers do ``import jsonutil as json`` and then use the
familiar :func:`dumps`, :func:`loads`, :func:`dump`, and :func:`load` names. The
fast :mod:`orjson` path is taken whenever the requested options fall within its
capabilities; otherwise the call transparently routes to the stdlib serializer
(decided by :func:`_needs_stdlib_dumps` and :func:`_option_bits`) so behaviour
stays compatible. ``loads`` likewise retries on the stdlib decoder for inputs
orjson rejects (e.g. lone surrogates).

These functions perform no I/O of their own beyond what the caller's file handle
does; they encode and decode in memory and are exercised pervasively for Redis
payloads, tool arguments, cached blobs, and LLM/JSON responses throughout the
repo. ``JSONDecodeError`` is aliased to the stdlib exception so that a single
``except json.JSONDecodeError`` catches failures from either backend.
"""

from __future__ import annotations

import json as _stdlib_json
from typing import IO, Any, Callable

import orjson

# Re-export the stdlib exception type under this module's namespace so that
# ``except json.JSONDecodeError`` keeps working for callers that do
# ``import jsonutil as json``. :func:`loads` also catches this class around
# ``orjson.loads`` to decide when to retry on the stdlib decoder.
JSONDecodeError = _stdlib_json.JSONDecodeError


def _option_bits(*, indent: int | None, sort_keys: bool) -> int:
    """Build the ``option`` bitmask passed to ``orjson.dumps``.

    Converts the stdlib-style ``indent`` / ``sort_keys`` arguments into the
    matching ``orjson.OPT_*`` flags. No I/O and no side effects.

    Args:
        indent: ``2`` or ``4`` to request pretty-printing, ``None`` for compact
            output. Any other value adds no indent flag. For ``4`` the
            ``OPT_INDENT_4`` constant is used if the installed orjson exposes
            it; otherwise ``OPT_INDENT_2`` is substituted.
        sort_keys: When truthy, ``OPT_SORT_KEYS`` is included.

    Returns:
        int: The OR-ed flag value, or ``0`` when no flag applies.
    """
    bits = orjson.OPT_SORT_KEYS if sort_keys else 0
    if indent == 4:
        # Not every orjson build has a 4-space flag; degrade to 2 spaces.
        bits |= getattr(orjson, "OPT_INDENT_4", orjson.OPT_INDENT_2)
    elif indent == 2:
        bits |= orjson.OPT_INDENT_2
    return bits


def _needs_stdlib_dumps(
    *,
    skipkeys: bool,
    ensure_ascii: bool,
    check_circular: bool,
    allow_nan: bool,
    cls: type | None,
    indent: int | str | None,
    separators: tuple[str, str] | None,
) -> bool:
    """Decide whether a ``dumps`` call must fall back to the stdlib serializer.

    orjson is fast but supports only a subset of stdlib ``json.dumps`` behaviour.
    This predicate inspects the requested options and returns ``True`` whenever
    any of them lies outside orjson's capabilities -- e.g. ``skipkeys``, a custom
    encoder ``cls``, disabled ``check_circular``, ``allow_nan`` semantics
    differences, ``ensure_ascii`` escaping, string/unsupported integer indents,
    or non-default ``separators`` -- so the caller can route to
    ``_stdlib_json.dumps`` instead.

    This is called only by :func:`dumps`, which uses the result to choose its
    serialization backend; it performs no I/O or other side effects.

    Args:
        skipkeys: Stdlib flag to skip non-basic dict keys (unsupported by orjson).
        ensure_ascii: When ``True``, non-ASCII must be escaped (orjson always
            emits UTF-8, so this forces the stdlib path).
        check_circular: Stdlib circular-reference check; disabling it forces the
            stdlib path.
        allow_nan: When ``False``, NaN/Infinity must raise rather than serialize,
            forcing the stdlib path.
        cls: A custom ``JSONEncoder`` subclass; any value forces the stdlib path.
        indent: Indentation spec. String indents, and integer indents other than
            ``2`` or ``4``, force the stdlib path.
        separators: Item/key separators; anything other than ``None`` or the
            compact ``(",", ":")`` forces the stdlib path.

    Returns:
        bool: ``True`` if the stdlib serializer is required, ``False`` if the
        orjson fast path can satisfy the request.
    """
    if (
        skipkeys
        or cls is not None
        or not check_circular
        or not allow_nan
        or ensure_ascii
    ):
        return True
    if isinstance(indent, str):
        return True
    if isinstance(indent, int) and indent not in (2, 4):
        return True
    if separators is not None and separators != (",", ":"):
        return True
    return False



# [docs]
def dumps(
    obj: Any,
    *,
    skipkeys: bool = False,
    ensure_ascii: bool = False,
    check_circular: bool = True,
    allow_nan: bool = True,
    cls: type | None = None,
    indent: int | str | None = None,
    separators: tuple[str, str] | None = None,
    default: Callable[..., Any] | None = None,
    sort_keys: bool = False,
) -> str:
    """Serialize *obj* to a JSON string (UTF-8 text, like ``ensure_ascii=False``).

    Backend selection happens in three steps:

    1. :func:`_needs_stdlib_dumps` is asked whether the options exceed what the
       orjson path handles. If so, ``_stdlib_json.dumps`` is called with every
       argument forwarded unchanged.
    2. Otherwise ``orjson.dumps`` is called with the flags produced by
       :func:`_option_bits` (plus ``default`` when one was supplied) and the
       resulting bytes are decoded as UTF-8.
    3. If orjson raises ``TypeError`` the value is retried on
       ``_stdlib_json.dumps`` with ``ensure_ascii=False``. For non-indented
       output the retry uses the compact ``(",", ":")`` separators so the text
       has the same shape as the orjson result instead of the stdlib's default
       ``", "`` / ``": "`` spacing; callers therefore do not see the formatting
       change depending on which backend happened to serialize the value.

    Unlike the stdlib, ``ensure_ascii`` defaults to ``False`` here. This
    function performs no I/O; :func:`dump` calls it to build the text it writes.

    Args:
        obj: The Python object to serialize.
        skipkeys: Skip non-basic dict keys (forces the stdlib path).
        ensure_ascii: Escape non-ASCII when ``True`` (forces the stdlib path).
        check_circular: Circular-reference detection; ``False`` forces the
            stdlib path.
        allow_nan: Permit ``NaN``/``Infinity``; ``False`` forces the stdlib
            path.
        cls: Optional ``JSONEncoder`` subclass; any value forces the stdlib
            path.
        indent: Integer ``2`` or ``4`` use the orjson indent flags; string
            indents and other integers force the stdlib path; ``None`` yields
            compact output.
        separators: Item/key separator pair; anything other than ``None`` or
            ``(",", ":")`` forces the stdlib path.
        default: Fallback callable for otherwise-unserializable values. Note
            that it may be invoked again during the stdlib retry.
        sort_keys: Emit object keys in sorted order.

    Returns:
        str: The serialized JSON text.

    Raises:
        TypeError: Propagated from the stdlib serializer when the object is
            still unserializable after the retry.
    """
    if _needs_stdlib_dumps(
        skipkeys=skipkeys,
        ensure_ascii=ensure_ascii,
        check_circular=check_circular,
        allow_nan=allow_nan,
        cls=cls,
        indent=indent,
        separators=separators,
    ):
        return _stdlib_json.dumps(
            obj,
            skipkeys=skipkeys,
            ensure_ascii=ensure_ascii,
            check_circular=check_circular,
            allow_nan=allow_nan,
            cls=cls,
            indent=indent,
            separators=separators,
            default=default,
            sort_keys=sort_keys,
        )

    # Past the predicate, ``indent`` is None or an int the fast path accepts;
    # anything that is not an int is treated as "no indentation".
    native_indent = indent if isinstance(indent, int) else None
    kw: dict[str, Any] = {
        "option": _option_bits(indent=native_indent, sort_keys=sort_keys),
    }
    if default is not None:
        kw["default"] = default
    try:
        return orjson.dumps(obj, **kw).decode("utf-8")
    except TypeError:
        # e.g. lone surrogates in str values — stdlib json accepts these.
        # Without an indent, stdlib would default to ", " / ": " separators,
        # whereas the orjson path is compact; pin the compact pair so both
        # backends produce the same layout.
        retry_separators = separators if native_indent is not None else (",", ":")
        return _stdlib_json.dumps(
            obj,
            indent=native_indent,
            separators=retry_separators,
            default=default,
            sort_keys=sort_keys,
            ensure_ascii=False,
        )




# [docs]
def loads(s: str | bytes | bytearray | memoryview) -> Any:
    """Deserialize a JSON document from text or bytes into a Python object.

    ``orjson.loads`` is tried first. If it raises ``JSONDecodeError`` the same
    input is handed to the stdlib ``json.loads``, which is more permissive
    (for example it accepts lone surrogates that orjson rejects).

    On the fallback, ``bytes`` / ``bytearray`` input is passed to the stdlib
    decoder as-is rather than being decoded here. ``json.loads`` performs its
    own encoding detection for byte input (UTF-8 with or without BOM, UTF-16,
    UTF-32) and, in CPython, decodes with ``errors="surrogatepass"``, so
    byte payloads that start with a BOM or use UTF-16/32 are handled instead of
    being rejected. A ``memoryview`` is first materialized to ``bytes`` because
    the stdlib decoder does not accept it.

    :func:`load` calls this after reading a file's contents. It performs no I/O
    of its own.

    Args:
        s: The JSON document as ``str``, ``bytes``, ``bytearray``, or
            ``memoryview``.

    Returns:
        Any: The decoded Python object.

    Raises:
        JSONDecodeError: If the input is not valid JSON for either backend.
    """
    try:
        return orjson.loads(s)
    except JSONDecodeError:
        # json.loads accepts str/bytes/bytearray only; copy a memoryview out.
        payload = s.tobytes() if isinstance(s, memoryview) else s
        # Let the stdlib pick the byte encoding (it also strips a UTF-8 BOM).
        return _stdlib_json.loads(payload)




# [docs]
def load(fp: IO[str]) -> Any:
    """Parse the entire contents of an open file-like object as JSON.

    The stream is consumed with a single ``fp.read()`` call and the result is
    handed to :func:`loads`, so the same orjson-first / stdlib-retry behaviour
    applies. The only I/O performed is that ``read`` call.

    Args:
        fp: A readable file-like object whose remaining contents form one JSON
            document.

    Returns:
        Any: The decoded Python object.

    Raises:
        JSONDecodeError: If the contents are not valid JSON.
    """
    document = fp.read()
    return loads(document)




# [docs]
def dump(
    obj: Any,
    fp: IO[str],
    *,
    skipkeys: bool = False,
    ensure_ascii: bool = False,
    check_circular: bool = True,
    allow_nan: bool = True,
    cls: type | None = None,
    indent: int | str | None = None,
    separators: tuple[str, str] | None = None,
    default: Callable[..., Any] | None = None,
    sort_keys: bool = False,
) -> None:
    """Write *obj* to *fp* as JSON text.

    All formatting options are passed through to :func:`dumps` untouched, so
    backend selection and option compatibility are exactly those of
    :func:`dumps`. The resulting string is emitted with one ``fp.write`` call,
    which is the only I/O this function performs.

    Args:
        obj: The Python object to serialize.
        fp: A writable text file-like object that receives the JSON text.
        skipkeys: Passed to :func:`dumps`.
        ensure_ascii: Passed to :func:`dumps`.
        check_circular: Passed to :func:`dumps`.
        allow_nan: Passed to :func:`dumps`.
        cls: Passed to :func:`dumps`.
        indent: Passed to :func:`dumps`.
        separators: Passed to :func:`dumps`.
        default: Passed to :func:`dumps`.
        sort_keys: Passed to :func:`dumps`.

    Raises:
        TypeError: Propagated from :func:`dumps` for unserializable objects.
    """
    options: dict[str, Any] = {
        "skipkeys": skipkeys,
        "ensure_ascii": ensure_ascii,
        "check_circular": check_circular,
        "allow_nan": allow_nan,
        "cls": cls,
        "indent": indent,
        "separators": separators,
        "default": default,
        "sort_keys": sort_keys,
    }
    # Serialize fully before touching the stream so a failure writes nothing.
    text = dumps(obj, **options)
    fp.write(text)