"""Generate high-quality music using Google Lyria 3 via the Gemini API.
Supports both the Clip model (30-second clips) and the Pro model
(full-length songs with verses, choruses, and bridges). Features:
- Text-to-music generation with full prompt engineering support
- Multimodal generation from up to 10 reference images (Pro)
- Custom lyrics with structural tags ([Verse], [Chorus], [Bridge], …)
- Timestamp-based structure control for precise arrangements
- WAV or MP3 output (WAV available on Pro model only)
- Generated lyrics/song structure returned alongside the audio file
"""
from __future__ import annotations
import asyncio
import base64
import hashlib
import jsonutil as json
import logging
from typing import Any, TYPE_CHECKING
if TYPE_CHECKING:
from tool_context import ToolContext
# Module-level logger; the helpers below report lookup, download and API
# failures through it instead of raising.
logger = logging.getLogger(__name__)
# Base URL of the Gemini REST API (v1beta); `_call_lyria` appends
# "/models/{model}:generateContent" to it.
GEMINI_API_BASE = "https://generativelanguage.googleapis.com/v1beta"
# Model identifiers accepted by `run`: Clip (30-second clips) and Pro
# (full-length songs).
LYRIA_CLIP_MODEL = "lyria-3-clip-preview"
LYRIA_PRO_MODEL = "lyria-3-pro-preview"
# Model used when the caller omits `model` or passes an unrecognised value.
DEFAULT_MODEL = LYRIA_CLIP_MODEL
# Shared key returned by `_resolve_api_key` when no per-user key is found.
# NOTE(review): this is a credential literal committed to source. It should be
# rotated and loaded from configuration/secret storage instead — confirm with
# the owners before changing, since removing it alters fallback behaviour.
FALLBACK_API_KEY = "AIzaSyCCwz9WCsIKSWsfufU6E-JbPsP1acLhZTU"
# Value passed as `daily_limit` to the shared-key rate-limit check in `run`.
_MUSIC_DAILY_LIMIT = 10
# Upper bound on how many reference image URLs `_call_lyria` will download.
_MAX_INPUT_IMAGES = 10
# Tool identifier; `run` also passes it to the shared-key rate-limit helpers.
TOOL_NAME = "generate_lyria_music"
# Human/LLM-facing summary of the tool: four sentences separated by single
# spaces, assembled here one sentence per entry.
TOOL_DESCRIPTION = " ".join(
    (
        "Generate high-quality 44.1 kHz stereo music using Google Lyria 3 "
        "(Gemini API).",
        "Use 'lyria-3-clip-preview' for instant 30-second clips or "
        "'lyria-3-pro-preview' for full-length songs with verses, choruses, "
        "and bridges.",
        "Supports rich text prompts (genre, instruments, BPM, key, mood, "
        "structure tags, timestamps, custom lyrics, multi-language), "
        "optional reference images for image-to-music, and WAV or MP3 "
        "output.",
        "The generated audio and any lyrics/structure notes are delivered "
        "to the current channel.",
    )
)
# JSON-Schema style description of the arguments accepted by `run`.
# Only `prompt` is required; every other argument has a default in `run`.
TOOL_PARAMETERS = {
    "type": "object",
    "properties": {
        "prompt": {
            "type": "string",
            # One bullet per line; the first line is the lead-in sentence.
            "description": "\n".join(
                (
                    "Music generation prompt. Include as many details as "
                    "possible for best results:",
                    "- Genre/style: 'lo-fi hip hop', 'cinematic orchestral'",
                    "- Instruments: 'Fender Rhodes piano', 'slide guitar'",
                    "- BPM: '120 BPM', 'slow around 70 BPM'",
                    "- Key/scale: 'in G major', 'D minor'",
                    "- Mood: 'nostalgic', 'aggressive', 'ethereal'",
                    "- Structure tags: [Verse], [Chorus], [Bridge], "
                    "[Intro], [Outro]",
                    "- Timestamps: '[0:00 - 0:10] Intro: soft piano …'",
                    "- Custom lyrics: include them directly with section tags",
                    "- Language: prompt in the target language for "
                    "non-English lyrics",
                    "- For instrumentals: add 'Instrumental only, no vocals'",
                    "- Duration (Pro): 'create a 2-minute song', or use "
                    "timestamps to control length",
                )
            ),
        },
        "model": {
            "type": "string",
            "description": " ".join(
                (
                    "Lyria model to use.",
                    "'lyria-3-clip-preview' (default): generates a fixed "
                    "30-second clip, great for loops and quick previews.",
                    "'lyria-3-pro-preview': generates a full-length song "
                    "lasting a couple of minutes, with full structural "
                    "understanding of verses, choruses, and bridges.",
                )
            ),
        },
        "output_format": {
            "type": "string",
            "description": " ".join(
                (
                    "Audio output format.",
                    "'mp3' (default) works with both models.",
                    "'wav' produces higher-fidelity uncompressed audio but "
                    "is only supported with 'lyria-3-pro-preview'; using wav "
                    "with the clip model falls back to mp3.",
                )
            ),
        },
        "image_urls": {
            "type": "array",
            "items": {"type": "string"},
            "description": " ".join(
                (
                    "Optional list of up to 10 publicly accessible image "
                    "URLs.",
                    "When provided, the model composes music inspired by "
                    "the visual content alongside your text prompt.",
                    "Recommended with 'lyria-3-pro-preview'.",
                )
            ),
        },
    },
    "required": ["prompt"],
}
async def _resolve_api_key(ctx: ToolContext | None) -> tuple[str, bool]:
    """Pick the Gemini API key for this call: the user's own, else the shared one.

    When ``ctx`` carries a truthy ``user_id``, the per-user key is looked up
    through :func:`tools.manage_api_keys.get_user_api_key` (imported lazily)
    for the ``"gemini"`` provider, forwarding ``ctx.redis``,
    ``ctx.channel_id`` and ``ctx.config`` when those attributes exist and
    ``None`` otherwise. Any exception raised by the import or the lookup is
    logged as a warning and treated the same as "no key found".

    Args:
        ctx: Tool execution context, or ``None``.

    Returns:
        ``(api_key, using_own_key)``. ``using_own_key`` is ``True`` only when
        the lookup produced a non-empty key; in every other case the pair is
        ``(FALLBACK_API_KEY, False)``.
    """
    # No context or no user identity: nothing to look up.
    if ctx is None or not getattr(ctx, "user_id", None):
        return FALLBACK_API_KEY, False

    try:
        from tools.manage_api_keys import get_user_api_key

        personal_key = await get_user_api_key(
            ctx.user_id,
            "gemini",
            redis_client=getattr(ctx, "redis", None),
            channel_id=getattr(ctx, "channel_id", None),
            config=getattr(ctx, "config", None),
        )
    except Exception as exc:
        # Best-effort lookup: a failure must not block generation.
        logger.warning("Failed to resolve user Gemini key: %s", exc)
        return FALLBACK_API_KEY, False

    if personal_key:
        return personal_key, True
    return FALLBACK_API_KEY, False
async def _download_image(url: str) -> tuple[bytes, str] | None:
    """Fetch one reference image for image-to-music generation.

    The URL is validated with :func:`tools._safe_http.assert_safe_http_url`
    and then fetched with a redirect-bounded GET (``max_redirects=5``,
    30-second client timeout) through ``safe_http_request`` inside a
    ``safe_httpx_client``. All three helpers are imported lazily.

    Every failure is logged as a warning and reported as ``None`` so a single
    bad image never raises into the caller:

    * ``url`` is not a string, or is empty/whitespace (tool arguments are
      model-generated, so list entries are not guaranteed to be strings);
    * URL validation raises ``ValueError`` or ``ImportError``;
    * the response status is not 200;
    * any exception occurs during the request.

    Args:
        url: Image URL; surrounding whitespace is stripped before validation.

    Returns:
        ``(image_bytes, mime_type)`` on success, otherwise ``None``. The MIME
        type is the lower-cased media type from the ``Content-Type`` header
        without parameters; when the header is missing or is not an
        ``image/*`` type it defaults to ``"image/jpeg"``.
    """
    # Guard before touching str methods: `None.strip()` would raise
    # AttributeError and propagate out of this "never raises" helper.
    if not isinstance(url, str) or not url.strip():
        logger.warning("Ignoring non-string or empty image URL: %r", url)
        return None

    from tools._safe_http import (
        assert_safe_http_url,
        safe_http_request,
        safe_httpx_client,
    )

    try:
        url = assert_safe_http_url(url.strip())
    except (ValueError, ImportError) as exc:
        logger.warning("Blocked or invalid image URL (%s): %s", url, exc)
        return None

    try:
        async with safe_httpx_client(timeout=30.0) as client:
            resp = await safe_http_request(client, "GET", url, max_redirects=5)
            if resp.status_code != 200:
                logger.warning(
                    "Image download returned %d for %s",
                    resp.status_code,
                    url,
                )
                return None

            # Drop parameters such as "; charset=..." and compare
            # case-insensitively: media types are case-insensitive, so
            # "Image/PNG" must not be mistaken for a non-image type.
            content_type = (
                resp.headers.get("content-type", "image/jpeg")
                .split(";")[0]
                .strip()
                .lower()
            )
            if not content_type.startswith("image/"):
                content_type = "image/jpeg"
            return resp.content, content_type
    except Exception as exc:
        logger.warning("Image download error for %s: %s", url, exc)
        return None
def _lyria_failure(message: str) -> dict[str, Any]:
    """Build the result dict :func:`_call_lyria` returns on any failure path."""
    return {
        "audio_bytes": None,
        "mime_type": "audio/mpeg",
        "lyrics": [],
        "error": message,
    }


async def _call_lyria(
    prompt: str,
    model: str,
    output_format: str,
    image_urls: list[str] | None,
    api_key: str,
) -> dict[str, Any]:
    """Call the Lyria ``generateContent`` endpoint and unpack audio and text.

    Steps:

    1. Build the request parts: the text prompt, followed by each reference
       image that could be downloaded (at most ``_MAX_INPUT_IMAGES`` URLs are
       attempted, concurrently, via :func:`_download_image`), base64-inlined.
       Reference images are optional input, so a download that returns
       ``None`` or raises is logged and skipped rather than aborting the call.
    2. POST the payload to ``{GEMINI_API_BASE}/models/{model}:generateContent``
       with the key in the ``x-goog-api-key`` header (300-second client
       timeout). ``responseMimeType`` is set to ``audio/wav`` only when
       ``output_format == "wav"`` and ``model`` is the Pro model.
    3. JSON-decode the body (in a worker thread when larger than 256 KiB) and
       scan every part of the first candidate, in any order, collecting
       ``text`` parts and base64-decoding ``inlineData``. If several
       ``inlineData`` parts are present, the last one decoded wins.

    Non-200 responses, request exceptions, JSON-decode failures and a
    missing/empty candidate list are logged (where applicable) and returned
    as an ``error`` result. A base64 failure on the audio is logged and
    leaves ``audio_bytes`` as ``None`` with ``error`` still ``None``.

    Args:
        prompt: Music description / lyrics / structure prompt.
        model: Lyria model identifier used in the endpoint path.
        output_format: ``"mp3"`` or ``"wav"``.
        image_urls: Optional reference image URLs.
        api_key: Gemini API key used for authentication.

    Returns:
        A dict with keys ``audio_bytes`` (``bytes`` or ``None``),
        ``mime_type``, ``lyrics`` (list of text parts) and ``error``
        (``None`` on success, otherwise a human-readable message).
    """
    from tools._safe_http import safe_http_request, safe_httpx_client

    endpoint = f"{GEMINI_API_BASE}/models/{model}:generateContent"

    # Build content parts: text prompt first, then optional images.
    parts: list[dict[str, Any]] = [{"text": prompt}]
    if image_urls:
        candidate_urls = image_urls[:_MAX_INPUT_IMAGES]
        # return_exceptions=True: one misbehaving download must not cancel
        # the whole generation; its exception comes back as a result instead.
        downloads = await asyncio.gather(
            *(_download_image(u) for u in candidate_urls),
            return_exceptions=True,
        )
        for image_url, outcome in zip(candidate_urls, downloads):
            if isinstance(outcome, BaseException):
                logger.warning(
                    "Skipping reference image %s: %s", image_url, outcome
                )
                continue
            if outcome is None:
                continue
            img_bytes, mime_type = outcome
            parts.append(
                {
                    "inlineData": {
                        "mimeType": mime_type,
                        "data": base64.b64encode(img_bytes).decode("utf-8"),
                    },
                }
            )

    generation_config: dict[str, Any] = {
        "responseModalities": ["AUDIO", "TEXT"],
    }
    # WAV output is only requested on the Pro model.
    if output_format == "wav" and model == LYRIA_PRO_MODEL:
        generation_config["responseMimeType"] = "audio/wav"

    payload: dict[str, Any] = {
        "contents": [{"parts": parts}],
        "generationConfig": generation_config,
    }
    headers = {
        "x-goog-api-key": api_key,
        "Content-Type": "application/json",
    }

    try:
        async with safe_httpx_client(timeout=300.0) as http:
            resp = await safe_http_request(
                http,
                "POST",
                endpoint,
                headers=headers,
                json=payload,
                max_redirects=5,
            )
            if resp.status_code != 200:
                # Cap the echoed body so logs and tool output stay bounded.
                err_body = resp.text[:800]
                logger.error(
                    "Lyria API error %d: %s",
                    resp.status_code,
                    err_body,
                )
                return _lyria_failure(
                    f"Lyria API error ({resp.status_code}): {err_body}"
                )
            raw_body = resp.content
    except Exception as exc:
        logger.error("Lyria API request failed: %s", exc, exc_info=True)
        return _lyria_failure(f"Request to Lyria API failed: {exc}")

    def _decode_body() -> Any:
        return json.loads(raw_body.decode("utf-8"))

    # Decode JSON; do it in a thread if the body is large (base64 audio can
    # be big) so the event loop is not blocked.
    try:
        if len(raw_body) > 256 * 1024:
            response_json = await asyncio.to_thread(_decode_body)
        else:
            response_json = _decode_body()
    except Exception as exc:
        logger.error("JSON decode of Lyria response failed: %s", exc)
        return _lyria_failure(f"Failed to parse Lyria API response: {exc}")

    # A non-object payload or a null/empty "candidates" is treated the same
    # as an absent candidate list instead of raising AttributeError.
    candidates = (
        response_json.get("candidates")
        if isinstance(response_json, dict)
        else None
    )
    if not candidates:
        return _lyria_failure(
            "Lyria API returned no candidates (prompt may have been blocked by safety filters)."
        )

    # `or` fallbacks tolerate explicit nulls for "content" / "parts".
    response_parts = (candidates[0].get("content") or {}).get("parts") or []

    lyrics: list[str] = []
    audio_bytes: bytes | None = None
    audio_mime = "audio/mpeg"

    # Iterate all parts regardless of order (spec says don't assume ordering).
    for part in response_parts:
        text = part.get("text")
        inline = part.get("inlineData")
        if text is not None:
            lyrics.append(text)
        elif inline:
            try:
                audio_bytes = base64.b64decode(inline.get("data", ""))
                audio_mime = inline.get("mimeType", "audio/mpeg")
            except Exception as exc:
                logger.error("Base64 decode of audio data failed: %s", exc)

    return {
        "audio_bytes": audio_bytes,
        "mime_type": audio_mime,
        "lyrics": lyrics,
        "error": None,
    }
# Tool entry point.
async def run(
    prompt: str,
    model: str = DEFAULT_MODEL,
    output_format: str = "mp3",
    image_urls: list[str] | None = None,
    ctx: "ToolContext | None" = None,
) -> str:
    """Generate music with Google Lyria 3 and send the audio to the channel.

    Flow:

    1. Validate and normalise the arguments. They originate from a model
       tool call, so they are treated as untrusted:

       * a blank or non-string ``prompt`` is rejected before any API call;
       * ``model`` is matched after trimming/lower-casing and falls back to
         ``DEFAULT_MODEL`` when unrecognised;
       * ``output_format`` falls back to ``"mp3"`` when unrecognised, and
         ``"wav"`` is downgraded to ``"mp3"`` unless the Pro model is used;
       * ``image_urls`` given as a single string is wrapped in a list, and
         non-string or blank entries are dropped.
    2. Resolve the API key via :func:`_resolve_api_key`. On the shared
       fallback key, the daily limit from ``tools.manage_api_keys`` is
       checked first when ``default_key_limit_applies(ctx)`` is true.
    3. Call :func:`_call_lyria`, then upload the audio with
       ``ctx.adapter.send_file`` and record it in ``ctx.sent_files``. The
       filename is ``lyria_<first 16 hex chars of SHA-256>.<ext>``.
    4. After a successful upload on the shared key, increment the usage
       counter.

    Args:
        prompt: Music description / lyrics / structure for the model.
        model: 'lyria-3-clip-preview' (30 s) or 'lyria-3-pro-preview'
            (full song).
        output_format: 'mp3' (default) or 'wav' (Pro only).
        image_urls: Optional image URLs for image-to-music generation.
        ctx: Tool execution context; ``adapter``, ``channel_id``,
            ``sent_files``, ``user_id`` and ``redis`` are read from it.

    Returns:
        A JSON string. On success it contains ``success``, ``model``,
        ``filename``, ``format`` and ``size_bytes``, plus
        ``lyrics_and_structure`` and ``file_url`` when available. On failure
        it contains a single ``error`` message, or whatever
        ``default_key_limit_error`` returns when the daily limit is hit.
    """
    if ctx is None or ctx.adapter is None:
        return json.dumps({"error": "No platform adapter available."})

    # Reject blank prompts before resolving keys or contacting the API.
    if not isinstance(prompt, str) or not prompt.strip():
        return json.dumps({"error": "A non-empty prompt is required."})

    # Normalise model (tolerate stray whitespace / capitalisation).
    if isinstance(model, str):
        model = model.strip().lower()
    if model not in (LYRIA_CLIP_MODEL, LYRIA_PRO_MODEL):
        model = DEFAULT_MODEL

    # Normalise format; anything that is not a recognised string becomes mp3.
    output_format = (
        output_format.strip().lower() if isinstance(output_format, str) else "mp3"
    )
    if output_format not in ("mp3", "wav"):
        output_format = "mp3"
    # WAV requires the Pro model.
    if output_format == "wav" and model != LYRIA_PRO_MODEL:
        output_format = "mp3"

    # Normalise image URLs. A bare string would otherwise be sliced and
    # iterated character by character downstream.
    if isinstance(image_urls, str):
        image_urls = [image_urls]
    elif not isinstance(image_urls, (list, tuple)):
        image_urls = None
    cleaned_urls = [
        u.strip() for u in (image_urls or ()) if isinstance(u, str) and u.strip()
    ]
    image_urls = cleaned_urls or None

    api_key, using_own_key = await _resolve_api_key(ctx)

    # Enforce daily limit when using the shared fallback key
    # (exempt: admin, BYPASS_RATELIMIT privilege, own key).
    if not using_own_key:
        from tools.manage_api_keys import (
            check_default_key_limit,
            default_key_limit_applies,
            default_key_limit_error,
        )

        if await default_key_limit_applies(ctx):
            allowed, current, limit = await check_default_key_limit(
                ctx.user_id,
                TOOL_NAME,
                ctx.redis,
                daily_limit=_MUSIC_DAILY_LIMIT,
            )
            if not allowed:
                return default_key_limit_error(TOOL_NAME, current, limit)

    try:
        result = await _call_lyria(
            prompt=prompt,
            model=model,
            output_format=output_format,
            image_urls=image_urls,
            api_key=api_key,
        )
    except Exception as exc:
        logger.error("Lyria generation error: %s", exc, exc_info=True)
        return json.dumps({"error": f"Music generation failed: {exc}"})

    if result["error"]:
        return json.dumps({"error": result["error"]})

    audio_bytes = result["audio_bytes"]
    if not audio_bytes:
        return json.dumps({"error": "Lyria returned no audio data."})

    mime_type: str = result["mime_type"]
    # The extension follows what the API actually returned, not the request.
    ext = "wav" if "wav" in mime_type else "mp3"
    # Content hash gives a stable filename for identical audio.
    h = hashlib.sha256(audio_bytes).hexdigest()[:16]
    fname = f"lyria_{h}.{ext}"

    try:
        file_url = await ctx.adapter.send_file(
            ctx.channel_id,
            audio_bytes,
            fname,
            mime_type,
        )
        ctx.sent_files.append(
            {
                "data": audio_bytes,
                "filename": fname,
                "mimetype": mime_type,
                "file_url": file_url or "",
            }
        )
    except Exception as exc:
        logger.error("Failed to send Lyria audio file: %s", exc, exc_info=True)
        return json.dumps({"error": f"Audio generated but upload failed: {exc}"})

    # Increment shared-key counter after a successful generation.
    if not using_own_key:
        from tools.manage_api_keys import (
            default_key_limit_applies,
            increment_default_key_usage,
        )

        if await default_key_limit_applies(ctx):
            await increment_default_key_usage(ctx.user_id, TOOL_NAME, ctx.redis)

    out: dict[str, Any] = {
        "success": True,
        "model": model,
        "filename": fname,
        "format": ext,
        "size_bytes": len(audio_bytes),
    }
    if result["lyrics"]:
        out["lyrics_and_structure"] = "\n\n".join(result["lyrics"])
    if file_url:
        out["file_url"] = file_url
    return json.dumps(out)