Source code for url_utils.detection

"""URL pattern detection helpers."""

from __future__ import annotations

import re



[docs]
def is_youtube_url(url: str) -> bool:
    """Return whether *url* is a link to a single YouTube video.

    The whole string is matched from its first character; the
    ``http://`` / ``https://`` scheme is optional.  Two shapes are accepted:

    * short links: ``youtu.be/<id>``
    * long links on ``youtube.com`` or ``youtube-nocookie.com`` (optionally
      prefixed with ``www.`` or ``m.``) under ``watch?v=``, ``embed/``,
      ``v/`` or ``shorts/``

    ``<id>`` is exactly 11 characters from ``[A-Za-z0-9_-]``.  Whatever
    follows the id must start with ``?``, ``&`` or ``#``, so additional query
    parameters (``watch?v=<id>&t=42s``) and fragments are accepted while an
    over-long id is still rejected.

    Args:
        url: Candidate URL string.

    Returns:
        ``True`` when *url* matches one of the shapes above, else ``False``.
    """
    video_id = r"([a-zA-Z0-9_-]{11})"
    # Only a query string, further `&` parameters or a fragment may follow
    # the id; requiring one of these delimiters keeps 12+ character ids out.
    trailer = r"([?&#].*)?$"
    short_form = r"(https?://)?(youtu\.be)/" + video_id + trailer
    long_form = (
        r"(https?://)?(www\.|m\.)?(youtube|youtube-nocookie)\.(com)/"
        r"(watch\?v=|embed/|v/|shorts/)" + video_id + trailer
    )
    # Both forms are always tried: the two hosts are disjoint, and a long
    # link may legitimately carry "youtu.be/" inside a query parameter.
    return any(
        re.match(form, url) is not None for form in (short_form, long_form)
    )




[docs]
def is_tenor_url(url: str) -> bool:
    """Report whether *url* contains a ``tenor.com/`` link.

    The pattern is searched for anywhere in the string.  An ``http(s)://``
    scheme and a ``www.`` or ``media.`` prefix are optional; the trailing
    slash after the host is required.
    """
    pattern = r"(https?://)?(www\.|media\.)?tenor\.com/"
    found = re.search(pattern, url)
    return found is not None




[docs]
def is_giphy_url(url: str) -> bool:
    """Report whether *url* contains a Giphy link.

    Searches anywhere in the string for ``giphy.com/``, the ``gph.is/``
    short host, or a ``media<N>.giphy.com/`` host, each optionally preceded
    by ``www.`` and/or an ``http(s)://`` scheme.
    """
    giphy_link = re.compile(
        r"(https?://)?(www\.)?(giphy\.com/|gph\.is/|media\d*\.giphy\.com/)"
    )
    return bool(giphy_link.search(url))




[docs]
def is_tweet_url(url: str) -> bool:
    """Return whether *url* contains a link to a single tweet / X post.

    Searches anywhere in *url* for ``<host>/<user>/status/<digits>`` where
    ``<host>`` is ``x.com`` or ``twitter.com``, optionally preceded by
    ``www.`` and/or an ``http(s)://`` scheme.

    The match may not begin directly after a letter, digit, underscore or
    hyphen.  Without that guard the one-letter ``x.com`` host is found at the
    tail of unrelated domains (``netflix.com``, ``dropbox.com``) and their
    ``/<segment>/status/<digits>`` paths would be reported as tweets.
    Sub-domains such as ``mobile.twitter.com`` are still accepted because the
    host is then preceded by a dot.

    Args:
        url: Candidate URL, or text containing one.

    Returns:
        ``True`` when a tweet-shaped link is found, else ``False``.
    """
    tweet_link = re.compile(
        # Host-boundary guard: refuse to start in the middle of a longer
        # host label such as "netflix" or "not-twitter".
        r"(?<![\w-])"
        r"(https?://)?(www\.)?(x|twitter)\.com/[^/]+/status/\d+"
    )
    return tweet_link.search(url) is not None




[docs]
def is_spotify_url(url: str) -> bool:
    """Report whether *url* contains a Spotify content link.

    Searches anywhere in the string for ``spotify.com/<kind>/<id>`` where
    ``<kind>`` is one of track, album, playlist, episode, artist or show and
    ``<id>`` is at least one ASCII letter or digit.  The ``open.`` prefix
    and the ``http(s)://`` scheme are optional.
    """
    host_part = r"(https?://)?(open\.)?spotify\.com/"
    resource_part = r"(track|album|playlist|episode|artist|show)/[a-zA-Z0-9]+"
    hit = re.search(host_part + resource_part, url)
    return hit is not None




[docs]
def is_soundcloud_url(url: str) -> bool:
    """Report whether *url* contains a two-segment SoundCloud link.

    Searches anywhere in the string for ``soundcloud.com/<a>/<b>`` where both
    path segments are non-empty and contain no ``/``.  ``www.`` and the
    ``http(s)://`` scheme are optional.
    """
    pattern = re.compile(r"(https?://)?(www\.)?soundcloud\.com/[^/]+/[^/]+")
    if pattern.search(url) is None:
        return False
    return True




[docs]
def is_tiktok_url(url: str) -> bool:
    """Report whether *url* contains a TikTok link.

    Searches anywhere in the string for ``tiktok.com/`` (optionally prefixed
    with ``www.`` or ``vm.`` and an ``http(s)://`` scheme) followed by either

    * ``@<user>/video/<digits>``, or
    * a path segment that starts with an ASCII letter or digit.
    """
    host = r"(https?://)?(www\.|vm\.)?tiktok\.com/"
    path = r"(@[^/]+/video/\d+|[a-zA-Z0-9]+/?)"
    return bool(re.search(host + path, url))




[docs]
def is_vimeo_url(url: str) -> bool:
    """Report whether *url* contains a numeric Vimeo video link.

    Searches anywhere in the string for ``vimeo.com/<digits>`` or
    ``vimeo.com/video/<digits>``; a ``www.`` or ``player.`` prefix and the
    ``http(s)://`` scheme are optional.
    """
    vimeo_link = r"(https?://)?(www\.|player\.)?vimeo\.com/(video/)?\d+"
    result = re.search(vimeo_link, url)
    return result is not None




[docs]
def is_github_url(url: str) -> bool:
    """Report whether *url* contains a GitHub ``<owner>/<repo>`` link.

    Searches anywhere in the string for ``github.com/<owner>/<repo>`` with
    both segments non-empty; ``www.`` and the ``http(s)://`` scheme are
    optional.  An ``/issues/<n>`` or ``/pull/<n>`` suffix is recognised by
    the pattern but is not required.  Because this is a substring search,
    hosts that end in ``github.com`` (for example ``gist.github.com``) match
    as well.
    """
    repo_part = r"(https?://)?(www\.)?github\.com/[^/]+/[^/]+"
    issue_or_pull_part = r"(/issues/\d+|/pull/\d+)?"
    github_link = re.compile(repo_part + issue_or_pull_part)
    return github_link.search(url) is not None




[docs]
def is_arxiv_url(url: str) -> bool:
    """Return whether *url* contains an arXiv abstract or PDF link.

    Searches anywhere in *url* for ``arxiv.org/abs/<id>`` or
    ``arxiv.org/pdf/<id>``; ``www.`` and the ``http(s)://`` scheme are
    optional.  ``<id>`` may be either

    * a run of digits and dots (covers the current ``YYMM.NNNNN`` scheme,
      with or without a ``vN`` version suffix), or
    * an old-style identifier ``<archive>[.<SC>]/<7 digits>`` such as
      ``hep-th/9901001`` or ``math.GT/0309136``, where ``<archive>`` is
      lower-case letters and hyphens and ``<SC>`` is two upper-case letters.

    Args:
        url: Candidate URL, or text containing one.

    Returns:
        ``True`` when such a link is found, else ``False``.
    """
    numeric_id = r"[\d\.]+v?\d*"
    # Pre-2007 identifiers carry an archive name (and optional subject
    # class) in front of a seven-digit number.
    legacy_id = r"[a-z-]+(\.[A-Z]{2})?/\d{7}(v\d+)?"
    pattern = (
        r"(https?://)?(www\.)?arxiv\.org/(abs|pdf)/"
        "(" + numeric_id + "|" + legacy_id + ")"
    )
    return re.search(pattern, url) is not None




[docs]
def is_reddit_url(url: str) -> bool:
    """Report whether *url* contains a Reddit comment-thread link.

    Searches anywhere in the string for
    ``reddit.com/r/<subreddit>/comments/<id>`` where ``<id>`` is at least one
    lower-case letter or digit.  A ``www.``, ``old.`` or ``new.`` prefix and
    the ``http(s)://`` scheme are optional.
    """
    subreddit_part = r"(https?://)?(www\.|old\.|new\.)?reddit\.com/r/[^/]+/"
    thread_part = r"comments/[a-z0-9]+"
    thread_link = re.compile(subreddit_part + thread_part)
    return bool(thread_link.search(url))




[docs]
def is_wikipedia_url(url: str) -> bool:
    """Report whether *url* contains a Wikipedia article link.

    Searches anywhere in the string for ``wikipedia.org/wiki/<title>`` where
    ``<title>`` begins with at least one character that is neither ``#`` nor
    whitespace.  A two- or three-letter lower-case sub-domain (``en.``,
    ``de.``, ...) and the ``http(s)://`` scheme are optional.
    """
    article_link = r"(https?://)?([a-z]{2,3}\.)?wikipedia\.org/wiki/[^#\s]+"
    located = re.search(article_link, url)
    return located is not None




[docs]
def is_gist_url(url: str) -> bool:
    """Return whether *url* contains a link to a GitHub gist.

    Searches anywhere in *url* for ``gist.github.com/<id>`` or
    ``gist.github.com/<owner>/<id>``; the ``http(s)://`` scheme is optional.
    ``<id>`` is a run of lower-case hexadecimal characters that must not be
    followed by another letter, digit, underscore or hyphen.  That boundary
    stops a name that merely *starts* with a hex character
    (``gist.github.com/alice``, ``/discover``, ``/<owner>/starred``) from
    being taken for a gist id.

    A first path segment made up solely of hex characters (``/deadbeef``)
    is still accepted, since it is indistinguishable from an owner-less
    gist id.

    Args:
        url: Candidate URL, or text containing one.

    Returns:
        ``True`` when a gist-shaped link is found, else ``False``.
    """
    gist_link = re.compile(
        r"(https?://)?gist\.github\.com/([^/]+/)?[a-f0-9]+"
        # The id must end here: next char may be "/", "?", "#", ".", etc.,
        # but not something that would make it a longer non-hex name.
        r"(?![A-Za-z0-9_-])"
    )
    return gist_link.search(url) is not None




[docs]
def is_bluesky_url(url: str) -> bool:
    """Report whether *url* contains a Bluesky post link.

    Searches anywhere in the string for
    ``bsky.app/profile/<handle>/post/<id>`` where ``<handle>`` is non-empty
    and free of ``/`` and ``<id>`` is at least one lower-case letter or
    digit.  The ``http(s)://`` scheme is optional.
    """
    post_link = re.compile(
        r"(https?://)?bsky\.app/profile/[^/]+/post/[a-z0-9]+"
    )
    return bool(post_link.search(url))




[docs]
def is_stackoverflow_url(url: str) -> bool:
    """Report whether *url* contains a Stack Exchange network question link.

    Searches anywhere in the string for ``<site>/questions/<digits>`` where
    ``<site>`` is ``stackoverflow.com``, ``superuser.com``,
    ``serverfault.com``, ``askubuntu.com`` or a lower-case
    ``<name>.stackexchange.com`` sub-domain.  ``www.`` and the
    ``http(s)://`` scheme are optional.
    """
    prefix = r"(https?://)?(www\.)?"
    sites = (
        r"(stackoverflow\.com|superuser\.com|serverfault\.com|"
        r"askubuntu\.com|[a-z]+\.stackexchange\.com)"
    )
    question_path = r"/questions/\d+"
    outcome = re.search(prefix + sites + question_path, url)
    return outcome is not None




[docs]
def is_nvd_cve_url(url: str) -> bool:
    """Report whether *url* contains an NVD CVE detail-page link.

    Searches anywhere in the string, ignoring case, for
    ``nvd.nist.gov/vuln/detail/CVE-<4 digits>-<4 or more digits>``.
    ``www.`` and the ``http(s)://`` scheme are optional.
    """
    cve_page = re.compile(
        r"(https?://)?(www\.)?nvd\.nist\.gov/vuln/detail/CVE-\d{4}-\d{4,}",
        flags=re.IGNORECASE,
    )
    return cve_page.search(url) is not None