# Source code for tools.query_arxiv

"""
Tool: query_arxiv
Search the arXiv database for scientific papers and return their abstracts, authors, and URLs.
"""

from __future__ import annotations

import asyncio
import jsonutil as json
import logging
import xml.etree.ElementTree as ET
from urllib.parse import quote

import httpx

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Tool metadata: the tool's identifier, a one-line summary of what it does,
# and a JSON-Schema-style description of the arguments accepted by ``run``.
# NOTE(review): presumably read by the tool dispatch layer -- confirm.
TOOL_NAME = "query_arxiv"
TOOL_DESCRIPTION = (
    "Search the arXiv database for scientific papers "
    "and return their abstracts, authors, and URLs."
)
TOOL_PARAMETERS = dict(
    type="object",
    properties=dict(
        # Search text; only this argument is required.
        query=dict(
            type="string",
            description=(
                "The search query "
                "(e.g. 'quantum entanglement' or 'au:feynman')."
            ),
        ),
        # Optional cap on the number of papers returned.
        max_results=dict(
            type="integer",
            description=(
                "Maximum number of results to return "
                "(default: 3, max 10)."
            ),
        ),
    ),
    required=["query"],
)


# Namespace map for the Atom elements looked up in the arXiv feed.
_ATOM_NS = {"atom": "http://www.w3.org/2005/Atom"}


def _child_text(parent: ET.Element, path: str, default: str) -> str:
    """Return the stripped text of the first ``path`` child of ``parent``.

    ``default`` is returned when the child is absent, and also when it is
    present but empty (``<title/>`` or ``<title></title>``): ElementTree
    reports such elements with ``text`` set to ``None``, which would
    otherwise raise ``AttributeError`` on ``.strip()``.
    """
    node = parent.find(path, _ATOM_NS)
    if node is None or node.text is None:
        return default
    return node.text.strip()


def _parse_arxiv_atom(xml_data: bytes) -> list[dict]:
    """Parse an arXiv Atom feed into a list of normalized paper records.

    Each ``atom:entry`` is reduced to its title, summary, id URL and author
    names. Title and summary are stripped and have embedded newlines
    replaced by single spaces; the id is only stripped. A field whose
    element is missing *or empty* falls back to ``"No Title"``,
    ``"No Summary"`` or ``"No Link"``. Authors without a non-empty
    ``atom:name`` are skipped.

    The function is synchronous and performs no I/O; :func:`run` calls it
    through :func:`asyncio.to_thread`.

    Args:
        xml_data: The raw Atom feed bytes from the arXiv query endpoint.

    Returns:
        One dict per entry with the keys ``"title"``, ``"authors"`` (a list
        of name strings), ``"abstract"`` and ``"url"``. The list is empty
        when the feed contains no entries.

    Raises:
        xml.etree.ElementTree.ParseError: If ``xml_data`` is not well-formed
            XML.
    """
    root = ET.fromstring(xml_data)
    results = []
    for entry in root.findall("atom:entry", _ATOM_NS):
        title = _child_text(entry, "atom:title", "No Title")
        summary = _child_text(entry, "atom:summary", "No Summary")
        name_nodes = (
            author.find("atom:name", _ATOM_NS)
            for author in entry.findall("atom:author", _ATOM_NS)
        )
        results.append(
            {
                "title": title.replace("\n", " "),
                # Skip authors with no <name>, or with an empty one.
                "authors": [
                    node.text.strip()
                    for node in name_nodes
                    if node is not None and node.text is not None
                ],
                "abstract": summary.replace("\n", " "),
                "url": _child_text(entry, "atom:id", "No Link"),
            }
        )
    return results



# [docs]  (documentation-link marker carried over from the rendered source page)
# arXiv export API endpoint; HTTPS so the query is not sent in clear text.
_ARXIV_API_URL = "https://export.arxiv.org/api/query"
# Field prefixes of the arXiv ``search_query`` syntax. A query that already
# starts with one of these is sent as written instead of wrapped in ``all:``.
_ARXIV_FIELD_PREFIXES = frozenset(
    {"ti", "au", "abs", "co", "jr", "cat", "rn", "id", "all"}
)
_DEFAULT_MAX_RESULTS = 3
_MAX_RESULTS_LIMIT = 10


def _build_query_url(query: str, max_results) -> str:
    """Build the arXiv API request URL for ``query`` and ``max_results``."""
    # Tool arguments may arrive as strings, floats or None; coerce rather
    # than fail, then clamp into the range 1..10.
    try:
        count = int(max_results)
    except (TypeError, ValueError, OverflowError):
        count = _DEFAULT_MAX_RESULTS
    count = max(1, min(count, _MAX_RESULTS_LIMIT))

    expression = query.strip()
    prefix, colon, _ = expression.partition(":")
    # "au:feynman" or "(ti:a OR ti:b)" already names a field; anything else
    # is treated as free text and searched across all fields.
    if not (colon and prefix.lstrip("( ") in _ARXIV_FIELD_PREFIXES):
        expression = f"all:{expression}"

    return (
        f"{_ARXIV_API_URL}?"
        f"search_query={quote(expression, safe=':')}"
        f"&start=0&max_results={count}"
    )


async def run(query: str, max_results: int = 3):
    """Search arXiv for papers and return their metadata as a JSON string.

    Tool entrypoint for ``query_arxiv``. The query is sent to the arXiv
    export API with an :class:`httpx.AsyncClient` (10-second timeout, custom
    ``User-Agent``). The Atom response is parsed by
    :func:`_parse_arxiv_atom` in a worker thread via
    :func:`asyncio.to_thread`, and the result is serialized with the
    module-level ``json`` name (``jsonutil``).

    A query that starts with an arXiv field prefix (``ti:``, ``au:``,
    ``abs:``, ``co:``, ``jr:``, ``cat:``, ``rn:``, ``id:``, ``all:``) is sent
    unchanged; any other query is searched across all fields (``all:``).

    Any ``Exception`` -- bad arguments, a network error, an error status
    raised by ``resp.raise_for_status()``, or an XML parse error -- is
    logged with ``logger.exception`` and returned as an error payload
    instead of being propagated.

    Args:
        query: The arXiv search query (e.g. ``"quantum entanglement"`` or
            ``"au:feynman"``).
        max_results: Maximum number of papers to return. Clamped to the
            range 1..10; a value that cannot be converted to an integer
            falls back to 3. Defaults to 3.

    Returns:
        str: A JSON-encoded string. On success it has ``"status": "success"``
        and a ``"results"`` list (plus a ``"message"`` when no papers
        matched); on failure it has ``"status": "error"`` and a
        ``"message"`` describing the problem.
    """
    try:
        # Built inside the ``try`` so malformed arguments produce an error
        # payload like every other failure.
        url = _build_query_url(query, max_results)

        async with httpx.AsyncClient(timeout=10.0) as client:
            resp = await client.get(
                url,
                headers={"User-Agent": "Stargazer-Agent/1.0"},
            )
            resp.raise_for_status()
            xml_data = await resp.aread()

        # Parse off the event loop.
        results = await asyncio.to_thread(_parse_arxiv_atom, xml_data)

        if not results:
            return json.dumps(
                {
                    "status": "success",
                    "message": "No results found for the given query.",
                    "results": [],
                }
            )

        return json.dumps({"status": "success", "results": results})

    except Exception as e:
        logger.exception("query_arxiv failed")
        return json.dumps(
            {
                "status": "error",
                "message": f"Failed to query arXiv: {e!s}",
            }
        )