"""Direct image URL detection and download."""
from __future__ import annotations
import asyncio
import logging
import re
from typing import Any, Dict, Optional
import aiohttp
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Lower-case, dot-prefixed file extensions treated as images.
# NOTE(review): not referenced in the visible part of this module; presumably
# consumed by callers elsewhere — verify before removing.
_IMAGE_EXTENSIONS = {".png", ".jpg", ".jpeg", ".gif", ".webp", ".bmp"}
# Upper bound, in bytes, on an image body accepted by download_image_url.
_MAX_IMAGE_DOWNLOAD = 20 * 1024 * 1024 # 20 MB
# User-Agent header value sent with both the HEAD probe and the GET request.
_UA = "StargazerBot/1.0"
# Kept comfortably under the inference media preprocess shield (~30s) so a live
# image fetch lands in history on the SAME turn instead of surfacing next turn.
# Both values are passed as aiohttp ``ClientTimeout(total=...)``, i.e. seconds
# for the whole request; worst case is HEAD + GET = 23s.
_HEAD_TIMEOUT = 3
_GET_TIMEOUT = 20
# Recognises image links inside arbitrary text. The pattern is unanchored, so
# it is meant to be used with ``search``. Group 1 is the scheme; group 2 is the
# remainder of the URL and must match one of four alternatives. The recurring
# character class ``[^\s\"<>'\`\[\]]`` stops a URL at whitespace, quotes, angle
# brackets, backticks and square brackets.
_IMAGE_URL_PATTERN = re.compile(
# Group 1: "http://" or "https://".
r"(https?://)"
r"("
# Discord CDN attachment links, accepted regardless of file extension.
r"cdn\.discordapp\.com/attachments/[^\s\"<>'\`\[\]]+"
# Discord media-proxy attachment links, likewise extension-agnostic.
r"|media\.discordapp\.net/attachments/[^\s\"<>'\`\[\]]+"
# Direct imgur links: alphanumeric id followed by an alphabetic extension.
r"|i\.imgur\.com/[a-zA-Z0-9]+\.[a-zA-Z]+"
# Any other host/path ending in a known image extension, with an optional
# "?query" suffix.
r"|[^\s\"<>'\`\[\]]+\.(?:png|jpe?g|gif|webp|bmp)(?:\?[^\s\"<>'\`\[\]]*)?"
r")",
# Hosts and extensions match case-insensitively (e.g. ".PNG").
re.IGNORECASE,
)
[docs]
def is_image_url(url: str) -> bool:
    """Report whether *url* contains something that looks like an image link.

    The module-level ``_IMAGE_URL_PATTERN`` is applied with ``search``, so the
    result is ``True`` when a matching link occurs anywhere in the string, not
    only when the whole string is a link.
    """
    found = _IMAGE_URL_PATTERN.search(url)
    return found is not None
[docs]
async def download_image_url(
    url: str,
) -> Optional[Dict[str, Any]]:
    """Download *url* and return its payload if it is an image within the size cap.

    A best-effort HEAD request is made first, purely to reject an oversized
    image early and to remember the advertised Content-Type. The GET that
    follows makes the real decision: it must answer 200, report an
    ``image/*`` type and stay within ``_MAX_IMAGE_DOWNLOAD`` bytes.

    Args:
        url: Address of the image. Surrounding whitespace is stripped and
            ``https://`` is prepended when no http(s) scheme is present.

    Returns:
        A dict with ``"data"`` (the raw bytes), ``"mimetype"`` (the bare
        ``type/subtype``) and ``"url"`` (the normalised address), or ``None``
        when the resource is not a usable image or the request fails.
        Failures are logged rather than raised.
    """
    try:
        norm = url.strip()
        # Compare against the full scheme, case-insensitively: a bare "http"
        # prefix test would leave hosts such as "httpbin.org/x.png" without a
        # scheme and would double-prefix "HTTP://...".
        if not norm.lower().startswith(("http://", "https://")):
            norm = "https://" + norm
        head_ct = ""
        async with aiohttp.ClientSession() as s:
            # Best-effort HEAD: use it only to early-reject an oversized image.
            # Many hosts block HEAD (405) or omit Content-Type, so a failed or
            # non-image HEAD must NOT drop the image — fall through to GET and
            # let it decide.
            try:
                async with s.head(
                    norm,
                    timeout=aiohttp.ClientTimeout(total=_HEAD_TIMEOUT),
                    headers={"User-Agent": _UA},
                    allow_redirects=True,
                ) as head_r:
                    head_ct = head_r.headers.get("Content-Type", "") or ""
                    cl = head_r.headers.get("Content-Length")
                    if cl and cl.isdigit() and int(cl) > _MAX_IMAGE_DOWNLOAD:
                        logger.info("Image too large (%s bytes): %s", cl, norm)
                        return None
            except Exception:
                # HEAD-hostile host; rely on the GET below. Logged at debug
                # level so the failure is still traceable.
                logger.debug(
                    "HEAD probe failed for %s; falling back to GET",
                    norm,
                    exc_info=True,
                )
            async with s.get(
                norm,
                timeout=aiohttp.ClientTimeout(total=_GET_TIMEOUT),
                headers={"User-Agent": _UA},
                allow_redirects=True,
            ) as r:
                if r.status != 200:
                    return None
                # ``r.content_type`` reports "application/octet-stream" when
                # the header is absent, so it can never be falsy. Check the
                # raw header to decide whether the HEAD value is needed.
                if r.headers.get("Content-Type"):
                    mimetype = (r.content_type or "").lower()
                else:
                    # Drop any parameters (e.g. "; charset=...") from the
                    # type remembered from the HEAD probe.
                    mimetype = head_ct.split(";", 1)[0].strip().lower()
                if not mimetype.startswith("image/"):
                    return None
                cl = r.headers.get("Content-Length")
                if cl and cl.isdigit() and int(cl) > _MAX_IMAGE_DOWNLOAD:
                    logger.info("Image too large (%s bytes): %s", cl, norm)
                    return None
                # Stream the body so the cap also holds for chunked responses
                # and for servers that omit or understate Content-Length;
                # reading it in one go would buffer an unbounded payload.
                chunk_size = 64 * 1024
                received = 0
                chunks = []
                async for chunk in r.content.iter_chunked(chunk_size):
                    received += len(chunk)
                    if received > _MAX_IMAGE_DOWNLOAD:
                        logger.info(
                            "Image exceeded %s bytes while downloading: %s",
                            _MAX_IMAGE_DOWNLOAD,
                            norm,
                        )
                        return None
                    chunks.append(chunk)
                return {
                    "data": b"".join(chunks),
                    "mimetype": mimetype,
                    "url": norm,
                }
    except asyncio.TimeoutError:
        logger.error("Timeout downloading image from %s", url)
    except Exception:
        logger.exception("Error downloading image from %s", url)
    return None