# Source code for url_utils.ytdlp_json

"""yt-dlp extractor detection and JSON parsing helpers."""

from __future__ import annotations

import json
import logging

# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# Cache filled by _get_ytdlp_extractors(): None until the first lookup, then
# the list of extractors (an empty list when yt-dlp could not be imported).
_ytdlp_extractors: list | None = None


def _get_ytdlp_extractors() -> list:
    """Return the cached yt-dlp extractors, loading them on first use.

    The ``"generic"`` extractor is left out. When ``yt_dlp`` cannot be
    imported, a warning is logged and an empty list is cached, so later calls
    neither retry the import nor repeat the warning.
    """
    global _ytdlp_extractors
    if _ytdlp_extractors is None:
        try:
            import yt_dlp

            loaded = [
                ie
                for ie in yt_dlp.extractor.gen_extractors()
                if ie.IE_NAME != "generic"
            ]
        except ImportError:
            logger.warning(
                "yt-dlp not installed — is_ytdlp_supported_url will always return False",
            )
            loaded = []
        # Only publish the result once it is complete.
        _ytdlp_extractors = loaded
    return _ytdlp_extractors



# [docs]
def is_ytdlp_supported_url(url: str) -> bool:
    """Return whether any extractor from ``_get_ytdlp_extractors`` accepts *url*.

    Empty values and strings that do not begin with ``"http"`` are rejected
    up front, without consulting the extractors.
    """
    looks_like_http = bool(url) and url.startswith("http")
    if not looks_like_http:
        return False
    return any(ie.suitable(url) for ie in _get_ytdlp_extractors())



# Command-line arguments as a flat tuple: the bare ``--force-ipv4`` switch
# followed by flag/value pairs (``--socket-timeout 12``, ``--retries 1``,
# ``--extractor-retries 1``).
# NOTE(review): presumably appended to yt-dlp invocations that only fetch
# metadata (per the name); the call sites are not in this file — confirm.
YTDLP_METADATA_NETWORK_ARGS: tuple[str, ...] = (
    "--force-ipv4",
    "--socket-timeout",
    "12",
    "--retries",
    "1",
    "--extractor-retries",
    "1",
)



# [docs]
def raise_process_file_limit(*, soft: int = 262_144) -> None:
    """Best-effort raise of this process's soft ``RLIMIT_NOFILE`` limit.

    The soft limit is lifted to ``soft``, capped at the hard limit when one
    is set. It is never lowered and the hard limit is never changed. Every
    failure is deliberately ignored so callers can invoke this
    unconditionally.

    Args:
        soft: Desired minimum value for the soft open-file limit.
    """
    try:
        # ``resource`` does not exist on every platform (e.g. Windows);
        # without it there is nothing to adjust.
        import resource
    except ImportError:
        return

    try:
        cur_soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
        unlimited = resource.RLIM_INFINITY
        if cur_soft == unlimited:
            return
        target = max(soft, cur_soft)
        # RLIM_INFINITY's numeric value is platform-dependent (it can be -1),
        # so it must be compared by identity of value, not fed into min().
        if hard != unlimited:
            target = min(target, hard)
        if target > cur_soft:
            resource.setrlimit(resource.RLIMIT_NOFILE, (target, hard))
    except (ValueError, OSError):
        # The OS refused the query or the new value; keep the current limit.
        pass



def _first_json_dict_in_string(s: str) -> dict | None:
    """Return the first JSON object that can be decoded from inside *s*.

    Each ``{`` in the stripped text is tried in order as the start of a JSON
    value; text before it and after the decoded value is ignored. Returns
    ``None`` when no position yields a dict.
    """
    text = s.strip()
    parser = json.JSONDecoder()
    start = text.find("{")
    while start != -1:
        try:
            candidate, _ = parser.raw_decode(text, start)
        except json.JSONDecodeError:
            # Not valid JSON from this brace; move on to the next one.
            pass
        else:
            if isinstance(candidate, dict):
                return candidate
        start = text.find("{", start + 1)
    return None



# [docs]
def parse_ytdlp_dump_json_stdout(stdout: bytes | str) -> dict | None:
    """Extract the first JSON object from captured process output.

    Args:
        stdout: Raw output as bytes (decoded as UTF-8, with undecodable bytes
            replaced) or as already-decoded text.

    Returns:
        The first dict found by ``_first_json_dict_in_string``, or ``None``
        when the output is empty or contains no decodable JSON object.
    """
    if isinstance(stdout, bytes):
        text = stdout.decode("utf-8", errors="replace")
    else:
        text = stdout
    # str.strip() does not treat U+FEFF as whitespace, so a leading BOM is
    # removed explicitly, followed by any whitespace that was behind it.
    text = text.strip().lstrip("\ufeff").strip()
    if not text:
        return None
    # The helper already tries every "{" in the whole text, and an object that
    # decodes within a single line decodes identically from the same "{" in
    # the full text, so a separate line-by-line pass could never find more.
    return _first_json_dict_in_string(text)




# [docs]
def iter_ytdlp_extractor_test_urls() -> list[tuple[str, str]]:
    """Collect ``(extractor name, URL)`` pairs from yt-dlp extractor ``_TESTS``.

    A test entry contributes a URL when it is a dict whose ``"url"`` value is
    a string starting with ``"http"``, or when the entry itself is such a
    string. Each URL is reported once, attributed to the first extractor that
    lists it.

    Returns:
        The pairs in discovery order, or an empty list when ``yt_dlp`` cannot
        be imported.
    """
    try:
        from yt_dlp.extractor import gen_extractors
    except ImportError:
        return []

    def _http_url(case: object) -> str | None:
        # Dict entries carry the URL under "url"; anything else is examined as-is.
        candidate = case.get("url") if isinstance(case, dict) else case
        if isinstance(candidate, str) and candidate.startswith("http"):
            return candidate
        return None

    pairs: list[tuple[str, str]] = []
    emitted: set[str] = set()
    for extractor in gen_extractors():
        ie_name = extractor.IE_NAME
        cases = getattr(extractor, "_TESTS", None) or []
        for case in cases:
            found = _http_url(case)
            if not found or found in emitted:
                continue
            emitted.add(found)
            pairs.append((ie_name, found))
    return pairs