Source code for tools.pollinations_tts

"""Text-to-speech via Pollinations.ai API, uploaded to the current channel."""

from __future__ import annotations

import asyncio
import hashlib
import logging
import os
import tempfile
from typing import TYPE_CHECKING
from urllib.parse import quote

if TYPE_CHECKING:
    from tool_context import ToolContext

# Module-level logger; run() uses it to report HTTP failures, unexpected
# content types, and exceptions.
logger = logging.getLogger(__name__)

# Base URL of the Pollinations audio endpoint. run() appends the
# percent-encoded text as a single path segment: f"{BASE}/{encoded}".
POLLINATIONS_AUDIO_BASE = "https://gen.pollinations.ai/audio"

# Supported output formats mapped to the MIME type handed to the adapter's
# send_file(). The keys double as the allow-list checked by
# _normalize_format() and as the temp-file / upload filename extension.
# NOTE(review): RFC 2046 defines "audio/basic" as 8-bit mu-law audio; confirm
# it is the intended MIME type for the API's "pcm" output.
_FORMAT_TO_MIME = {
    "mp3": "audio/mpeg",
    "wav": "audio/wav",
    "opus": "audio/opus",
    "aac": "audio/aac",
    "flac": "audio/flac",
    "pcm": "audio/basic",
}

# Tool metadata: name, human-readable description, and a JSON-Schema-style
# description of the arguments accepted by run().
# NOTE(review): presumably read by the tool-loading machinery -- confirm.
TOOL_NAME = "pollinations_tts"
TOOL_DESCRIPTION = (
    "Convert text to speech using the Pollinations.ai audio API "
    "(GET https://gen.pollinations.ai/audio/{text} with voice and format "
    "query parameters), save to a temporary audio file, and upload the "
    "result to the current channel."
)
TOOL_PARAMETERS = {
    "type": "object",
    "properties": {
        # run() strips the text and rejects empty input or more than 4096
        # characters.
        "text": {
            "type": "string",
            "description": "The text to synthesize as speech (max ~4096 chars).",
        },
        # Not validated against a list; _normalize_voice() only trims and
        # lowercases it, defaulting to "nova".
        "voice": {
            "type": "string",
            "description": (
                "Voice preset (e.g. alloy, echo, fable, onyx, nova, shimmer, "
                "rachel, adam). Default: nova."
            ),
        },
        # Must match a key of _FORMAT_TO_MIME; anything else falls back to
        # "mp3" in _normalize_format().
        "output_format": {
            "type": "string",
            "description": (
                "Audio container: mp3 (default), wav, opus, aac, flac, or pcm."
            ),
        },
        # run() forwards this only when it lies within 0.25 <= speed <= 4.0.
        "speed": {
            "type": "number",
            "description": (
                "Optional speech speed (0.25–4.0) when supported by the API."
            ),
        },
    },
    # Only the text is mandatory; every other argument has a default in run().
    "required": ["text"],
}


def _normalize_voice(voice: str) -> str:
    """Coerce a caller-supplied voice name into a safe lowercase preset.

    Trims surrounding whitespace and lowercases the value, falling back to
    ``"nova"`` whenever the input is empty, ``None``, or whitespace-only. The
    result is forwarded verbatim as the ``voice`` query parameter on the
    Pollinations audio request, so this normalization is purely cosmetic --
    no allow-list is enforced and unknown voices are passed straight through
    to the upstream API. This is a pure function with no side effects.

    This is called by :func:`run` in this module to derive ``voice_n`` before
    building the request parameters; no other internal callers were found.

    Args:
        voice: The raw voice preset requested by the model (e.g. ``"alloy"``,
            ``"nova"``), possibly empty or ``None``.

    Returns:
        str: A trimmed, lowercased voice name, or ``"nova"`` when the input
        resolves to nothing usable.
    """
    v = (voice or "nova").strip().lower()
    return v if v else "nova"


def _normalize_format(fmt: str) -> str:
    """Map a requested audio format onto a key of :data:`_FORMAT_TO_MIME`.

    The input is trimmed, lowercased, and stripped of any leading dots, so
    ``".WAV"`` and ``" wav "`` both resolve to ``"wav"``. Whatever is not a
    key of :data:`_FORMAT_TO_MIME` afterwards -- including a falsy input --
    becomes ``"mp3"``. The function has no side effects.

    Args:
        fmt: Output format as supplied by the caller; may be falsy.

    Returns:
        A key of :data:`_FORMAT_TO_MIME`, with ``"mp3"`` as the fallback.
    """
    requested = fmt if fmt else "mp3"
    candidate = requested.strip().lower()
    # Accept extension-style spellings such as ".mp3".
    candidate = candidate.lstrip(".")
    if candidate not in _FORMAT_TO_MIME:
        return "mp3"
    return candidate


def _write_temp_read_unlink(data: bytes, suffix: str) -> bytes:
    """Round-trip bytes through a temporary file and return them, cleaning up.

    Writes the audio payload to a freshly created temp file, reads it straight
    back, and unlinks the file in a ``finally`` so nothing is left on disk. The
    round-trip exists so the returned bytes have actually been materialized on
    the filesystem (matching how the platform adapter expects staged file
    payloads) while keeping the synchronous, blocking file I/O off the event
    loop -- :func:`run` invokes it through ``asyncio.to_thread``.

    This touches only the local filesystem via ``tempfile.mkstemp`` and
    ``os`` file operations; it performs no network or Redis I/O. Within this
    module it is called once by :func:`run`; no other in-repo callers were
    found.

    Args:
        data: The audio bytes to stage.
        suffix: File suffix (e.g. ``".mp3"``) for the temp file.

    Returns:
        bytes: The same payload, read back from the temp file before deletion.
    """
    fd, path = tempfile.mkstemp(suffix=suffix)
    try:
        with os.fdopen(fd, "wb") as f:
            f.write(data)
        with open(path, "rb") as f:
            return f.read()
    finally:
        try:
            os.unlink(path)
        except OSError:
            pass


async def run(
    text: str,
    voice: str = "nova",
    output_format: str = "mp3",
    speed: float | None = None,
    ctx: ToolContext | None = None,
) -> str:
    """Synthesize speech via the Pollinations audio API and upload it.

    Steps, in order:

    1. Reject a missing context/adapter, empty text, or text longer than
       4096 characters (measured after stripping whitespace).
    2. Resolve the API key with ``tools.pollinate._resolve_api_key(ctx)``;
       when no key is returned, the result of
       ``tools.manage_api_keys.missing_api_key_error("pollinations")`` is
       returned instead.
    3. Normalize the format and voice with :func:`_normalize_format` and
       :func:`_normalize_voice`.
    4. Send ``GET {POLLINATIONS_AUDIO_BASE}/{percent-encoded text}`` with
       ``voice``, ``format`` and (optionally) ``speed`` query parameters and
       a ``Bearer`` authorization header, using the client and request
       helpers from ``tools._safe_http`` (120 s timeout, up to 5 redirects).
    5. Stage the response bytes through :func:`_write_temp_read_unlink` in a
       worker thread, upload them with ``ctx.adapter.send_file`` to
       ``ctx.channel_id``, and append a record of the upload to
       ``ctx.sent_files``.

    Args:
        text: Text to synthesize; stripped before validation.
        voice: Voice preset; normalized, defaulting to ``"nova"``.
        output_format: Audio format; normalized, defaulting to ``"mp3"``.
        speed: Optional speech speed. Forwarded only when it converts to a
            float within ``0.25``-``4.0``; any other value (out of range or
            not numeric) is ignored and the request is sent without it.
        ctx: Tool execution context. ``ctx.adapter``, ``ctx.channel_id`` and
            ``ctx.sent_files`` are used here.

    Returns:
        A success message (including the file URL when ``send_file`` returns
        one), or a string starting with ``Error`` when the adapter is
        missing, the text is empty or too long, the API answers with a
        non-200 status or an empty body, or an exception is raised during
        the request, staging, or upload. A missing API key yields whatever
        ``missing_api_key_error`` returns.

    Raises:
        Exception: Anything raised by ``_resolve_api_key`` propagates, since
            key resolution happens before the ``try`` block.
    """
    from tools._safe_http import safe_http_request, safe_httpx_client
    from tools.pollinate import _resolve_api_key

    if ctx is None or ctx.adapter is None:
        return "Error: No platform adapter available."

    text = (text or "").strip()
    if not text:
        return "Error: text is empty."
    if len(text) > 4096:
        return "Error: text exceeds 4096 characters."

    api_key, _own = await _resolve_api_key(ctx)
    if not api_key:
        from tools.manage_api_keys import missing_api_key_error

        return missing_api_key_error("pollinations")

    fmt = _normalize_format(output_format)
    voice_n = _normalize_voice(voice)
    mime = _FORMAT_TO_MIME[fmt]
    suffix = f".{fmt}"

    # safe="" also escapes "/" so the text stays a single path segment.
    encoded = quote(text, safe="")
    url = f"{POLLINATIONS_AUDIO_BASE}/{encoded}"
    params: dict[str, str | float] = {"voice": voice_n, "format": fmt}

    # Tool arguments come from the model, so speed may not be numeric.
    # Unusable values are dropped just like out-of-range ones rather than
    # raising before the request is even attempted.
    if speed is not None:
        try:
            speed_value = float(speed)
        except (TypeError, ValueError):
            speed_value = None
        if speed_value is not None and 0.25 <= speed_value <= 4.0:
            params["speed"] = speed_value

    headers = {"Authorization": f"Bearer {api_key}"}

    try:
        async with safe_httpx_client(timeout=120.0) as http:
            resp = await safe_http_request(
                http,
                "GET",
                url,
                params=params,
                headers=headers,
                max_redirects=5,
            )

            if resp.status_code != 200:
                logger.error(
                    "Pollinations TTS HTTP %s: %s",
                    resp.status_code,
                    resp.text[:500],
                )
                return (
                    f"Error: Pollinations API returned {resp.status_code}: "
                    f"{resp.text[:300]}"
                )

            # An unexpected content type is only logged; the payload is
            # still uploaded as the requested format.
            ct = (resp.headers.get("content-type") or "").split(";")[0].strip().lower()
            if ct and not ct.startswith("audio/") and "octet-stream" not in ct:
                logger.warning("Pollinations TTS unexpected content-type: %s", ct)

            audio_bytes = resp.content
            if not audio_bytes:
                return "Error: empty audio response."

        # Blocking file I/O runs in a worker thread, off the event loop.
        staged = await asyncio.to_thread(
            _write_temp_read_unlink,
            audio_bytes,
            suffix,
        )

        # Content-derived name: identical audio yields an identical filename.
        short = hashlib.sha256(staged).hexdigest()[:12]
        fname = f"pollinations_tts_{short}{suffix}"

        file_url = await ctx.adapter.send_file(
            ctx.channel_id,
            staged,
            fname,
            mime,
        )
        ctx.sent_files.append(
            {
                "data": staged,
                "filename": fname,
                "mimetype": mime,
                "file_url": file_url or "",
            }
        )

        msg = "Successfully uploaded Pollinations TTS audio to the channel."
        if file_url:
            msg += f" File URL: {file_url}"
        return msg
    except Exception as exc:
        # Tool boundary: report the failure to the model instead of raising.
        logger.error("Pollinations TTS error: %s", exc, exc_info=True)
        return f"Error generating speech: {exc}"