"""Text-to-speech via Pollinations.ai API, uploaded to the current channel."""
from __future__ import annotations
import asyncio
import hashlib
import logging
import os
import tempfile
from typing import TYPE_CHECKING
from urllib.parse import quote
if TYPE_CHECKING:
from tool_context import ToolContext
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Base endpoint of the Pollinations audio API; the URL-encoded text to speak
# is appended to it as a path segment.
POLLINATIONS_AUDIO_BASE = "https://gen.pollinations.ai/audio"

# Supported output formats mapped to the MIME type used when the audio is
# uploaded. The keys double as the allow-list of accepted formats and as the
# temp-file suffix (without the dot).
# NOTE(review): "audio/basic" is registered for 8 kHz mu-law audio (RFC 2046);
# confirm it is the intended type for the API's raw "pcm" output.
_FORMAT_TO_MIME = dict(
    mp3="audio/mpeg",
    wav="audio/wav",
    opus="audio/opus",
    aac="audio/aac",
    flac="audio/flac",
    pcm="audio/basic",
)
# Name under which this tool is exposed.
TOOL_NAME = "pollinations_tts"

# Human/model-readable summary of what the tool does.
TOOL_DESCRIPTION = (
    "Convert text to speech using the Pollinations.ai audio API "
    "(GET https://gen.pollinations.ai/audio/{text} with voice and format "
    "query parameters), save to a temporary audio file, and upload the "
    "result to the current channel."
)

# JSON-Schema-style description of the arguments accepted by ``run``.
# Only ``text`` is required; the remaining arguments have defaults in ``run``.
TOOL_PARAMETERS = {
    "type": "object",
    "properties": {
        # Text to be spoken.
        "text": {
            "type": "string",
            "description": "The text to synthesize as speech (max ~4096 chars).",
        },
        # Voice preset name.
        "voice": {
            "type": "string",
            "description": (
                "Voice preset (e.g. alloy, echo, fable, onyx, nova, shimmer, "
                "rachel, adam). Default: nova."
            ),
        },
        # Audio container / encoding of the result.
        "output_format": {
            "type": "string",
            "description": "Audio container: mp3 (default), wav, opus, aac, flac, or pcm.",
        },
        # Optional playback speed multiplier.
        "speed": {
            "type": "number",
            "description": "Optional speech speed (0.25–4.0) when supported by the API.",
        },
    },
    "required": ["text"],
}
def _normalize_voice(voice: str) -> str:
    """Coerce a caller-supplied voice name into a trimmed, lowercase preset.

    Surrounding whitespace is removed and the value is lowercased. Empty,
    ``None``, or whitespace-only input falls back to ``"nova"``. No allow-list
    is applied here: any other non-empty name is returned as-is (after
    trimming/lowercasing) and :func:`run` forwards it as the ``voice`` query
    parameter.

    Args:
        voice: Raw voice preset (e.g. ``"alloy"``, ``" Nova "``), possibly
            empty or ``None``.

    Returns:
        The trimmed, lowercased voice name, or ``"nova"`` when nothing usable
        remains.
    """
    default_voice = "nova"
    cleaned = (voice or default_voice).strip().lower()
    # A whitespace-only input is truthy but strips down to "", so the default
    # has to be applied a second time after cleaning.
    if not cleaned:
        return default_voice
    return cleaned
def _normalize_format(fmt: str) -> str:
    """Validate and canonicalize the requested audio output format.

    The input is trimmed, lowercased, and stripped of any leading dots (so
    ``".wav"`` and ``"..wav"`` both become ``"wav"``), then checked against the
    keys of :data:`_FORMAT_TO_MIME`. Anything that is not a key of that table,
    including empty or ``None`` input, falls back to ``"mp3"``. The result is
    therefore always a valid key of :data:`_FORMAT_TO_MIME`, which :func:`run`
    relies on for the MIME lookup and the temp-file suffix.

    Args:
        fmt: Raw output format (e.g. ``"mp3"``, ``".wav"``, ``"OPUS"``),
            possibly empty or ``None``.

    Returns:
        A key of :data:`_FORMAT_TO_MIME`; ``"mp3"`` for unrecognized input.
    """
    default_format = "mp3"
    candidate = (fmt or default_format).strip().lower().lstrip(".")
    if candidate in _FORMAT_TO_MIME:
        return candidate
    return default_format
def _write_temp_read_unlink(data: bytes, suffix: str) -> bytes:
    """Round-trip bytes through a temporary file and return what was read back.

    The payload is written to a file created with ``tempfile.mkstemp``, read
    back from that path, and the file is removed in a ``finally`` clause so it
    is deleted whether or not the write/read succeeds. All I/O here is
    blocking, which is why :func:`run` calls this via ``asyncio.to_thread``.

    Args:
        data: The audio bytes to stage.
        suffix: File suffix (e.g. ``".mp3"``) for the temp file.

    Returns:
        The bytes read back from the temp file before it was deleted.

    Raises:
        OSError: If creating, writing, or reading the temp file fails.
            Failures while deleting the file are suppressed.
    """
    tmp_fd, tmp_path = tempfile.mkstemp(suffix=suffix)
    try:
        # fdopen takes ownership of the descriptor; leaving the ``with`` block
        # flushes and closes it before the file is reopened for reading.
        with os.fdopen(tmp_fd, "wb") as writer:
            writer.write(data)
        with open(tmp_path, "rb") as reader:
            staged = reader.read()
        return staged
    finally:
        # Cleanup is deliberately best-effort: failing to delete the temp file
        # must not mask the result or the original error.
        try:
            os.unlink(tmp_path)
        except OSError:
            pass
async def run(
    text: str,
    voice: str = "nova",
    output_format: str = "mp3",
    speed: float | None = None,
    ctx: ToolContext | None = None,
) -> str:
    """Synthesize speech via the Pollinations audio API and upload it.

    Implements the ``pollinations_tts`` tool. The steps are:

    1. Validate that a context with an adapter is present and that ``text``
       is non-empty and at most 4096 characters after trimming.
    2. Resolve the Pollinations API key with
       ``tools.pollinate._resolve_api_key``; when none is found, return the
       result of ``tools.manage_api_keys.missing_api_key_error``.
    3. Normalize the format and voice with :func:`_normalize_format` and
       :func:`_normalize_voice`.
    4. GET ``POLLINATIONS_AUDIO_BASE/<url-encoded text>`` with ``voice``,
       ``format`` and (when valid) ``speed`` query parameters and a bearer
       ``Authorization`` header, using the ``tools._safe_http`` helpers.
    5. Stage the response bytes through :func:`_write_temp_read_unlink` in a
       worker thread, upload them with ``ctx.adapter.send_file`` to
       ``ctx.channel_id``, and append a record to ``ctx.sent_files``.

    Args:
        text: The text to synthesize; trimmed, must be non-empty and at most
            4096 characters.
        voice: Voice preset name; normalized, defaulting to ``"nova"``.
        output_format: Audio format; normalized, defaulting to ``"mp3"``.
        speed: Optional speech speed. It is sent only when it converts to a
            float within ``0.25``-``4.0``; non-numeric or out-of-range values
            are ignored.
        ctx: The tool execution context. Must be non-``None`` with a
            non-``None`` ``adapter``; ``ctx.channel_id`` and
            ``ctx.sent_files`` are used for the upload.

    Returns:
        A success message (including the file URL when the adapter returns
        one), or an ``Error``-prefixed string when the adapter or API key is
        missing, the text is empty or too long, the API answers with a
        non-200 status or an empty body, or an exception is raised during the
        request, staging, or upload.
    """
    if ctx is None or ctx.adapter is None:
        return "Error: No platform adapter available."

    text = (text or "").strip()
    if not text:
        return "Error: text is empty."
    if len(text) > 4096:
        return "Error: text exceeds 4096 characters."

    # Project-local helpers are imported lazily inside the function, and only
    # after the cheap argument checks above have passed.
    from tools.pollinate import _resolve_api_key

    api_key, _ = await _resolve_api_key(ctx)
    if not api_key:
        from tools.manage_api_keys import missing_api_key_error

        return missing_api_key_error("pollinations")

    fmt = _normalize_format(output_format)
    voice_n = _normalize_voice(voice)
    mime = _FORMAT_TO_MIME[fmt]
    suffix = f".{fmt}"

    # safe="" also percent-encodes "/" so the text stays one path segment.
    encoded = quote(text, safe="")
    url = f"{POLLINATIONS_AUDIO_BASE}/{encoded}"
    params: dict[str, str | float] = {"voice": voice_n, "format": fmt}

    # Coerce speed defensively: a non-numeric value is ignored (like an
    # out-of-range one) instead of raising out of the tool.
    speed_value: float | None = None
    if speed is not None:
        try:
            speed_value = float(speed)
        except (TypeError, ValueError):
            logger.warning(
                "Pollinations TTS ignoring non-numeric speed: %r", speed
            )
    # NaN fails both comparisons, so it is dropped here as well.
    if speed_value is not None and 0.25 <= speed_value <= 4.0:
        params["speed"] = speed_value

    headers = {"Authorization": f"Bearer {api_key}"}

    try:
        from tools._safe_http import safe_http_request, safe_httpx_client

        async with safe_httpx_client(timeout=120.0) as http:
            resp = await safe_http_request(
                http,
                "GET",
                url,
                params=params,
                headers=headers,
                max_redirects=5,
            )

        if resp.status_code != 200:
            logger.error(
                "Pollinations TTS HTTP %s: %s",
                resp.status_code,
                resp.text[:500],
            )
            return (
                f"Error: Pollinations API returned {resp.status_code}: "
                f"{resp.text[:300]}"
            )

        # An unexpected content type is only logged; the body is still
        # uploaded as audio.
        ct = (resp.headers.get("content-type") or "").split(";")[0].strip().lower()
        if ct and not ct.startswith("audio/") and "octet-stream" not in ct:
            logger.warning("Pollinations TTS unexpected content-type: %s", ct)

        audio_bytes = resp.content
        if not audio_bytes:
            return "Error: empty audio response."

        # Blocking file I/O runs in a worker thread to keep the loop free.
        staged = await asyncio.to_thread(
            _write_temp_read_unlink,
            audio_bytes,
            suffix,
        )

        # A short content hash gives a stable, content-derived filename.
        short = hashlib.sha256(staged).hexdigest()[:12]
        fname = f"pollinations_tts_{short}{suffix}"

        file_url = await ctx.adapter.send_file(
            ctx.channel_id,
            staged,
            fname,
            mime,
        )
        ctx.sent_files.append(
            {
                "data": staged,
                "filename": fname,
                "mimetype": mime,
                "file_url": file_url or "",
            }
        )

        msg = "Successfully uploaded Pollinations TTS audio to the channel."
        if file_url:
            msg += f" File URL: {file_url}"
        return msg
    except Exception as exc:
        # Tool boundary: log the traceback and report the failure as text.
        logger.exception("Pollinations TTS error: %s", exc)
        return f"Error generating speech: {exc}"