"""
Tool: query_arxiv
Search the arXiv database for scientific papers and return their abstracts, authors, and URLs.
"""
from __future__ import annotations
import asyncio
import jsonutil as json
import logging
import xml.etree.ElementTree as ET
from urllib.parse import quote
import httpx
# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)

# Identifier and human-readable summary under which this tool is advertised.
TOOL_NAME = "query_arxiv"
TOOL_DESCRIPTION = (
    "Search the arXiv database for scientific papers "
    "and return their abstracts, authors, and URLs."
)

# JSON-Schema-style description of the arguments accepted by ``run``:
# ``query`` is mandatory, ``max_results`` is optional.
TOOL_PARAMETERS = {
    "type": "object",
    "properties": {
        "query": {
            "type": "string",
            "description": "The search query (e.g. 'quantum entanglement' or 'au:feynman').",
        },
        "max_results": {
            "type": "integer",
            "description": "Maximum number of results to return (default: 3, max 10).",
        },
    },
    "required": ["query"],
}
def _node_text(node: "ET.Element | None", default: str) -> str:
    """Return the whitespace-normalised text of ``node``, or ``default``.

    ``default`` is returned when the element is absent, when it is present
    but empty (``Element.text`` is ``None`` for ``<tag/>``), or when its text
    is whitespace only. Runs of whitespace -- including the newline plus
    indentation of wrapped lines -- are collapsed to a single space.
    """
    if node is None or node.text is None:
        return default
    collapsed = " ".join(node.text.split())
    return collapsed or default


def _parse_arxiv_atom(xml_data: bytes) -> list[dict]:
    """Parse an Atom feed into a list of plain paper records.

    Every ``atom:entry`` element directly under the feed root yields one
    dict built from its ``atom:title``, ``atom:summary``, ``atom:id`` and
    ``atom:author/atom:name`` children. Text is stripped and has internal
    whitespace collapsed to single spaces. A missing *or empty* title,
    summary or id falls back to ``"No Title"``, ``"No Summary"`` or
    ``"No Link"`` respectively; authors without a usable name are skipped.

    The function is synchronous and performs no I/O.

    Args:
        xml_data: Raw Atom XML document.

    Returns:
        One dict per entry with the keys ``"title"``, ``"authors"`` (list of
        name strings), ``"abstract"`` and ``"url"``. Empty when the feed has
        no entries.

    Raises:
        xml.etree.ElementTree.ParseError: If ``xml_data`` is not well-formed
            XML.
    """
    # NOTE(review): the stdlib parser is not hardened against hostile XML;
    # this assumes the feed comes from a trusted endpoint -- confirm.
    root = ET.fromstring(xml_data)
    ns = {"atom": "http://www.w3.org/2005/Atom"}

    papers = []
    for entry in root.findall("atom:entry", ns):
        authors = []
        for author in entry.findall("atom:author", ns):
            name = _node_text(author.find("atom:name", ns), "")
            if name:
                authors.append(name)

        papers.append(
            {
                "title": _node_text(entry.find("atom:title", ns), "No Title"),
                "authors": authors,
                "abstract": _node_text(entry.find("atom:summary", ns), "No Summary"),
                "url": _node_text(entry.find("atom:id", ns), "No Link"),
            }
        )
    return papers
# Field prefixes of the arXiv query syntax. A query that already starts with
# one of these is sent as written instead of being wrapped in ``all:``.
_ARXIV_FIELD_PREFIXES = (
    "all:", "ti:", "au:", "abs:", "co:", "jr:", "cat:", "rn:", "id:",
)


async def run(query: str, max_results: int = 3):
    """Search arXiv and return the matching papers as a JSON payload.

    The requested result count is coerced to ``int`` and clamped to the
    range 1..10. A query that already begins with an arXiv field prefix
    (for example ``au:feynman``) is forwarded unchanged; any other query is
    searched across all fields via ``all:``. The request is an asynchronous
    ``GET`` to ``https://export.arxiv.org/api/query`` through
    :class:`httpx.AsyncClient` with a 10-second timeout and a custom
    ``User-Agent``; the response body is parsed by :func:`_parse_arxiv_atom`
    in a worker thread via :func:`asyncio.to_thread`.

    Every failure inside the function -- invalid arguments, network errors,
    a non-success HTTP status (``raise_for_status``), or malformed XML -- is
    logged with ``logger.exception`` and reported as an error payload rather
    than raised.

    Args:
        query: Search text, optionally starting with an arXiv field prefix
            such as ``ti:``, ``au:`` or ``cat:``.
        max_results: Desired number of papers; clamped to 1..10.
            Defaults to 3.

    Returns:
        The value of ``json.dumps`` (the ``jsonutil`` module imported under
        that name) for a dict that has ``"status": "success"`` and a
        ``"results"`` list (plus a ``"message"`` when nothing matched), or
        ``"status": "error"`` and a ``"message"`` describing the failure.
    """
    try:
        # Validation lives inside the ``try`` so that a bad argument produces
        # an error payload like every other failure.
        limit = max(1, min(int(max_results), 10))

        search = query.strip()
        if not search.startswith(_ARXIV_FIELD_PREFIXES):
            search = f"all:{search}"

        # ":" stays literal so field prefixes remain readable in the URL.
        url = (
            "https://export.arxiv.org/api/query?"
            f"search_query={quote(search, safe='/:')}&start=0&max_results={limit}"
        )

        async with httpx.AsyncClient(timeout=10.0) as client:
            resp = await client.get(
                url,
                headers={"User-Agent": "Stargazer-Agent/1.0"},
            )
            resp.raise_for_status()
            # A non-streamed response is already fully read at this point.
            xml_data = resp.content

        results = await asyncio.to_thread(_parse_arxiv_atom, xml_data)
        if not results:
            return json.dumps(
                {
                    "status": "success",
                    "message": "No results found for the given query.",
                    "results": [],
                }
            )
        return json.dumps({"status": "success", "results": results})
    except Exception as e:
        # Tool boundary: report the problem to the caller instead of raising.
        logger.exception("query_arxiv failed")
        return json.dumps(
            {
                "status": "error",
                "message": f"Failed to query arXiv: {e!s}",
            }
        )