# Source code for tools.query_arxiv

"""
Tool: query_arxiv
Search the arXiv database for scientific papers and return their abstracts, authors, and URLs.
"""

from __future__ import annotations

import asyncio
import jsonutil as json
import logging
import xml.etree.ElementTree as ET
from urllib.parse import quote

import httpx

# Module-level logger; run() uses it to record failures via logger.exception.
logger = logging.getLogger(__name__)

# Tool metadata: the name and human-readable description of this tool.
# Presumably read by the tool dispatch layer to advertise and route the tool;
# the consumer is not visible in this file -- verify against the dispatcher.
TOOL_NAME = "query_arxiv"
TOOL_DESCRIPTION = "Search the arXiv database for scientific papers and return their abstracts, authors, and URLs."
# JSON-Schema-style description of the keyword arguments accepted by run():
# "query" is required; "max_results" is optional (run() defaults it to 3 and
# caps it at 10, matching the limits quoted in the description string below).
TOOL_PARAMETERS = {
    "type": "object",
    "properties": {
        "query": {
            "type": "string",
            "description": "The search query (e.g. 'quantum entanglement' or 'au:feynman').",
        },
        "max_results": {
            "type": "integer",
            "description": "Maximum number of results to return (default: 3, max 10).",
        },
    },
    "required": ["query"],
}


def _parse_arxiv_atom(xml_data: bytes) -> list[dict]:
    """Parse an arXiv Atom feed into a list of normalized paper records.

    Each direct ``atom:entry`` child of the feed root becomes one dict holding
    the entry's title, summary (exposed as ``"abstract"``), id (exposed as
    ``"url"``) and author names. Text is stripped of surrounding whitespace;
    title and summary additionally have embedded newlines replaced by spaces.

    A field that is absent, empty (``<title/>``) or whitespace-only falls back
    to ``"No Title"``, ``"No Summary"`` or ``"No Link"``. Authors whose
    ``atom:name`` is absent or empty are skipped. The function is synchronous
    and performs no I/O; :func:`run` calls it through
    :func:`asyncio.to_thread`.

    Args:
        xml_data: The raw Atom feed bytes returned by the arXiv query endpoint.

    Returns:
        One dict per entry with the keys ``"title"``, ``"authors"`` (a list of
        name strings), ``"abstract"`` and ``"url"``. Empty when the feed has no
        entries.

    Raises:
        xml.etree.ElementTree.ParseError: If ``xml_data`` is not well-formed
            XML.
    """
    namespace = {"atom": "http://www.w3.org/2005/Atom"}

    def _text(node: "ET.Element | None", default: str, *, flatten: bool = False) -> str:
        """Return the node's stripped text, or *default* when there is none."""
        # ElementTree reports an empty element (<title/> or <title></title>)
        # with ``text is None``, so a bare ``node.text.strip()`` would raise
        # AttributeError; treat that case exactly like a missing element.
        if node is None or node.text is None:
            return default
        text = node.text.strip()
        if flatten:
            text = text.replace("\n", " ")
        return text or default

    results = []
    for entry in ET.fromstring(xml_data).findall("atom:entry", namespace):
        # An empty default marks "no usable name" so the filter below drops it.
        names = (
            _text(author.find("atom:name", namespace), "")
            for author in entry.findall("atom:author", namespace)
        )
        results.append(
            {
                "title": _text(
                    entry.find("atom:title", namespace), "No Title", flatten=True
                ),
                "authors": [name for name in names if name],
                "abstract": _text(
                    entry.find("atom:summary", namespace), "No Summary", flatten=True
                ),
                "url": _text(entry.find("atom:id", namespace), "No Link"),
            }
        )
    return results


async def run(query: str, max_results: int = 3) -> str:
    """Search arXiv for papers and return their metadata as a JSON string.

    This is the entrypoint for the ``query_arxiv`` tool. It coerces
    ``max_results`` to an integer and clamps it to the range 1-10, URL-encodes
    the query into an ``all:`` search expression, and issues an asynchronous
    ``GET`` against the arXiv export API using an :class:`httpx.AsyncClient`
    with a 10-second timeout and a custom ``User-Agent``. The response body is
    parsed by :func:`_parse_arxiv_atom` in a worker thread (via
    :func:`asyncio.to_thread`) so XML parsing does not block the event loop.
    The result is serialized with the ``jsonutil`` module that this file
    imports under the name ``json``.

    Every failure -- an argument that cannot be coerced or encoded, a network
    error, a non-2xx status (via ``resp.raise_for_status()``), or an XML parse
    error -- is logged with ``logger.exception`` and returned as an error
    payload instead of being raised.

    Args:
        query: The free-text arXiv search query (e.g.
            ``"quantum entanglement"``).
        max_results: Maximum number of papers to return. Values above 10 are
            clamped to 10 and values below 1 are clamped to 1. Defaults to 3.

    Returns:
        A JSON-encoded string. On success it has ``"status": "success"`` and a
        ``"results"`` list (plus a ``"message"`` when no papers matched); on
        failure it has ``"status": "error"`` and a ``"message"`` describing
        the problem.
    """
    try:
        # Validate inside the ``try`` so a malformed argument (for example a
        # non-numeric ``max_results`` or a non-string ``query``) is reported
        # as an error payload like every other failure, instead of escaping.
        limit = max(1, min(int(max_results), 10))
        # HTTPS keeps the request encrypted. httpx does not follow redirects
        # by default, so a plain-HTTP URL that got redirected would make
        # ``raise_for_status()`` fail.
        # NOTE(review): ``quote`` percent-encodes ":" so a field-prefixed
        # query such as "au:feynman" is sent as ``all:au%3Afeynman`` --
        # confirm this is the intended handling of the prefixed example given
        # in TOOL_PARAMETERS.
        url = (
            "https://export.arxiv.org/api/query?"
            f"search_query=all:{quote(query)}&start=0&max_results={limit}"
        )
        async with httpx.AsyncClient(timeout=10.0) as client:
            resp = await client.get(
                url,
                headers={"User-Agent": "Stargazer-Agent/1.0"},
            )
            resp.raise_for_status()
            xml_data = await resp.aread()

        results = await asyncio.to_thread(_parse_arxiv_atom, xml_data)
        if not results:
            return json.dumps(
                {
                    "status": "success",
                    "message": "No results found for the given query.",
                    "results": [],
                }
            )
        return json.dumps({"status": "success", "results": results})
    except Exception as e:
        # Deliberate catch-all at the tool boundary: the caller receives a
        # JSON error payload and the traceback goes to the log.
        logger.exception("query_arxiv failed")
        return json.dumps(
            {
                "status": "error",
                "message": f"Failed to query arXiv: {e!s}",
            }
        )