Source code for url_utils.detection

"""URL pattern detection helpers."""

from __future__ import annotations

import re


# Short share links: youtu.be/<11-char id>, optionally followed by a query
# string or fragment.
_YOUTUBE_SHORT_URL_RE = re.compile(
    r"(?:https?://)?youtu\.be/[a-zA-Z0-9_-]{11}(?:[?&#].*)?$"
)

# Full-site links: watch, embed, legacy /v/ and shorts URLs on youtube.com or
# youtube-nocookie.com. The id may be followed by "&..." (extra query
# parameters after "watch?v="), "?..." or "#...".
_YOUTUBE_LONG_URL_RE = re.compile(
    r"(?:https?://)?(?:www\.|m\.)?(?:youtube|youtube-nocookie)\.com/"
    r"(?:watch\?v=|embed/|v/|shorts/)[a-zA-Z0-9_-]{11}(?:[?&#].*)?$"
)


def is_youtube_url(url: str) -> bool:
    """Return True if *url* is a YouTube video URL.

    The whole string must be the URL (matching is anchored at both ends).
    Accepted shapes are ``youtu.be/<id>`` and, on ``youtube.com`` or
    ``youtube-nocookie.com`` (optionally with a ``www.`` or ``m.`` prefix),
    ``watch?v=<id>``, ``embed/<id>``, ``v/<id>`` and ``shorts/<id>``, where
    ``<id>`` is exactly 11 characters from ``[a-zA-Z0-9_-]``. The scheme is
    optional.

    Args:
        url: Candidate URL string.

    Returns:
        True when *url* matches one of the shapes above, False otherwise.
    """
    # Try both shapes instead of choosing one from a substring test, so a
    # full-site URL that merely mentions "youtu.be/" in its query string is
    # still recognised.
    return (
        _YOUTUBE_SHORT_URL_RE.match(url) is not None
        or _YOUTUBE_LONG_URL_RE.match(url) is not None
    )
# The leading lookbehind stops the host from being matched as the tail of a
# longer hostname label (e.g. "nottenor.com"); subdomains, which are preceded
# by ".", still match.
_TENOR_URL_RE = re.compile(
    r"(?<![\w-])(?:https?://)?(?:www\.|media\.)?tenor\.com/"
)


def is_tenor_url(url: str) -> bool:
    """Return True if *url* contains a link to ``tenor.com``.

    The pattern is searched for anywhere in the string; the scheme and a
    ``www.`` / ``media.`` prefix are optional.

    Args:
        url: Candidate URL string.

    Returns:
        True when a ``tenor.com/`` link is found, False otherwise.
    """
    return _TENOR_URL_RE.search(url) is not None
# The leading lookbehind rejects look-alike hosts such as "notgiphy.com"
# while still allowing any subdomain (preceded by ".").
_GIPHY_URL_RE = re.compile(
    r"(?<![\w-])(?:https?://)?(?:www\.)?"
    r"(?:giphy\.com/|gph\.is/|media\d*\.giphy\.com/)"
)


def is_giphy_url(url: str) -> bool:
    """Return True if *url* contains a Giphy link.

    Recognised hosts are ``giphy.com``, the ``gph.is`` short domain and the
    ``media<N>.giphy.com`` hosts. The pattern is searched for anywhere in
    the string and the scheme is optional.

    Args:
        url: Candidate URL string.

    Returns:
        True when a Giphy link is found, False otherwise.
    """
    return _GIPHY_URL_RE.search(url) is not None
# Without the leading lookbehind, "x\.com/" would also match inside hosts
# such as "netflix.com" or "fedex.com". Subdomains (e.g. "mobile.twitter.com")
# are preceded by "." and still match.
_TWEET_URL_RE = re.compile(
    r"(?<![\w-])(?:https?://)?(?:www\.)?(?:x|twitter)\.com/[^/]+/status/\d+"
)


def is_tweet_url(url: str) -> bool:
    """Return True if *url* contains a link to a single tweet / X post.

    A match has the shape ``<host>/<user>/status/<digits>`` where the host is
    ``x.com`` or ``twitter.com``. The pattern is searched for anywhere in the
    string and the scheme is optional.

    Args:
        url: Candidate URL string.

    Returns:
        True when a status link is found, False otherwise.
    """
    return _TWEET_URL_RE.search(url) is not None
# The leading lookbehind rejects look-alike hosts such as "notspotify.com".
_SPOTIFY_URL_RE = re.compile(
    r"(?<![\w-])(?:https?://)?(?:open\.)?spotify\.com/"
    r"(?:track|album|playlist|episode|artist|show)/[a-zA-Z0-9]+"
)


def is_spotify_url(url: str) -> bool:
    """Return True if *url* contains a Spotify content link.

    A match has the shape ``spotify.com/<kind>/<id>`` (optionally on the
    ``open.`` subdomain) where ``<kind>`` is one of track, album, playlist,
    episode, artist or show and ``<id>`` is alphanumeric. The pattern is
    searched for anywhere in the string and the scheme is optional.

    Args:
        url: Candidate URL string.

    Returns:
        True when such a link is found, False otherwise.
    """
    return _SPOTIFY_URL_RE.search(url) is not None
# The leading lookbehind rejects look-alike hosts such as
# "fakesoundcloud.com"; subdomains (preceded by ".") still match.
_SOUNDCLOUD_URL_RE = re.compile(
    r"(?<![\w-])(?:https?://)?(?:www\.)?soundcloud\.com/[^/]+/[^/]+"
)


def is_soundcloud_url(url: str) -> bool:
    """Return True if *url* contains a SoundCloud link with two path segments.

    A match has the shape ``soundcloud.com/<segment>/<segment>``. The pattern
    is searched for anywhere in the string and the scheme is optional.

    Args:
        url: Candidate URL string.

    Returns:
        True when such a link is found, False otherwise.
    """
    return _SOUNDCLOUD_URL_RE.search(url) is not None
# The leading lookbehind rejects look-alike hosts such as "nottiktok.com".
_TIKTOK_URL_RE = re.compile(
    r"(?<![\w-])(?:https?://)?(?:www\.|vm\.)?tiktok\.com/"
    r"(?:@[^/]+/video/\d+|[a-zA-Z0-9]+/?)"
)


def is_tiktok_url(url: str) -> bool:
    """Return True if *url* contains a TikTok link.

    Two shapes are recognised after the host: ``@<user>/video/<digits>`` and
    a single alphanumeric path segment (as used by short links). The pattern
    is searched for anywhere in the string; the scheme and a ``www.`` /
    ``vm.`` prefix are optional.

    Args:
        url: Candidate URL string.

    Returns:
        True when such a link is found, False otherwise.
    """
    return _TIKTOK_URL_RE.search(url) is not None
# The leading lookbehind rejects look-alike hosts such as "notvimeo.com".
_VIMEO_URL_RE = re.compile(
    r"(?<![\w-])(?:https?://)?(?:www\.|player\.)?vimeo\.com/(?:video/)?\d+"
)


def is_vimeo_url(url: str) -> bool:
    """Return True if *url* contains a Vimeo video link.

    A match has the shape ``vimeo.com/<digits>`` or
    ``vimeo.com/video/<digits>``. The pattern is searched for anywhere in the
    string; the scheme and a ``www.`` / ``player.`` prefix are optional.

    Args:
        url: Candidate URL string.

    Returns:
        True when such a link is found, False otherwise.
    """
    return _VIMEO_URL_RE.search(url) is not None
# The leading lookbehind rejects look-alike hosts such as "evilgithub.com";
# subdomains (preceded by ".") still match. The trailing issue/pull group is
# optional, so it documents the intended shapes without restricting matches.
_GITHUB_URL_RE = re.compile(
    r"(?<![\w-])(?:https?://)?(?:www\.)?github\.com/[^/]+/[^/]+"
    r"(?:/issues/\d+|/pull/\d+)?"
)


def is_github_url(url: str) -> bool:
    """Return True if *url* contains a GitHub repository-level link.

    A match has the shape ``github.com/<owner>/<repo>``, optionally followed
    by ``/issues/<n>`` or ``/pull/<n>``. The pattern is searched for anywhere
    in the string and the scheme is optional.

    Args:
        url: Candidate URL string.

    Returns:
        True when such a link is found, False otherwise.
    """
    return _GITHUB_URL_RE.search(url) is not None
# The leading lookbehind rejects hosts that merely end in "arxiv.org"
# (e.g. "eartharxiv.org"); subdomains such as "export.arxiv.org" still match.
_ARXIV_URL_RE = re.compile(
    r"(?<![\w-])(?:https?://)?(?:www\.)?arxiv\.org/(?:abs|pdf)/[\d\.]+v?\d*"
)


def is_arxiv_url(url: str) -> bool:
    """Return True if *url* contains an arXiv abstract or PDF link.

    A match has the shape ``arxiv.org/abs/<id>`` or ``arxiv.org/pdf/<id>``
    where ``<id>`` starts with digits and dots and may carry a ``v<n>``
    version suffix. The pattern is searched for anywhere in the string and
    the scheme is optional.

    Args:
        url: Candidate URL string.

    Returns:
        True when such a link is found, False otherwise.
    """
    return _ARXIV_URL_RE.search(url) is not None
# The leading lookbehind rejects look-alike hosts such as "notreddit.com".
_REDDIT_URL_RE = re.compile(
    r"(?<![\w-])(?:https?://)?(?:www\.|old\.|new\.)?reddit\.com/r/[^/]+/"
    r"comments/[a-z0-9]+"
)


def is_reddit_url(url: str) -> bool:
    """Return True if *url* contains a link to a Reddit comment thread.

    A match has the shape ``reddit.com/r/<subreddit>/comments/<id>`` where
    ``<id>`` is lowercase alphanumeric. The pattern is searched for anywhere
    in the string; the scheme and a ``www.`` / ``old.`` / ``new.`` prefix are
    optional.

    Args:
        url: Candidate URL string.

    Returns:
        True when such a link is found, False otherwise.
    """
    return _REDDIT_URL_RE.search(url) is not None
# The leading lookbehind rejects look-alike hosts such as
# "fakewikipedia.org"; language and mobile subdomains (preceded by ".")
# still match.
_WIKIPEDIA_URL_RE = re.compile(
    r"(?<![\w-])(?:https?://)?(?:[a-z]{2,3}\.)?wikipedia\.org/wiki/[^#\s]+"
)


def is_wikipedia_url(url: str) -> bool:
    """Return True if *url* contains a Wikipedia article link.

    A match has the shape ``wikipedia.org/wiki/<title>`` where ``<title>`` is
    at least one character that is neither ``#`` nor whitespace. The pattern
    is searched for anywhere in the string and the scheme is optional.

    Args:
        url: Candidate URL string.

    Returns:
        True when such a link is found, False otherwise.
    """
    return _WIKIPEDIA_URL_RE.search(url) is not None
# The leading lookbehind rejects look-alike hosts such as
# "notgist.github.com".
_GIST_URL_RE = re.compile(
    r"(?<![\w-])(?:https?://)?gist\.github\.com/(?:[^/]+/)?[a-f0-9]+"
)


def is_gist_url(url: str) -> bool:
    """Return True if *url* contains a GitHub Gist link.

    A match has the shape ``gist.github.com/<hex>`` or
    ``gist.github.com/<user>/<hex>`` where ``<hex>`` begins with at least one
    lowercase hexadecimal character. The pattern is searched for anywhere in
    the string and the scheme is optional.

    Args:
        url: Candidate URL string.

    Returns:
        True when such a link is found, False otherwise.
    """
    return _GIST_URL_RE.search(url) is not None
# The leading lookbehind rejects look-alike hosts such as "notbsky.app".
_BLUESKY_URL_RE = re.compile(
    r"(?<![\w-])(?:https?://)?bsky\.app/profile/[^/]+/post/[a-z0-9]+"
)


def is_bluesky_url(url: str) -> bool:
    """Return True if *url* contains a link to a Bluesky post.

    A match has the shape ``bsky.app/profile/<handle>/post/<id>`` where
    ``<id>`` is lowercase alphanumeric. The pattern is searched for anywhere
    in the string and the scheme is optional.

    Args:
        url: Candidate URL string.

    Returns:
        True when such a link is found, False otherwise.
    """
    return _BLUESKY_URL_RE.search(url) is not None
# The leading lookbehind rejects look-alike hosts such as
# "notstackoverflow.com"; subdomains (preceded by ".") still match. The
# Stack Exchange site label allows digits and hyphens so that hosts like
# "3dprinting.stackexchange.com" are matched as a whole label.
_STACK_QUESTION_URL_RE = re.compile(
    r"(?<![\w-])(?:https?://)?(?:www\.)?"
    r"(?:stackoverflow\.com|superuser\.com|serverfault\.com|"
    r"askubuntu\.com|[a-z0-9-]+\.stackexchange\.com)/questions/\d+"
)


def is_stackoverflow_url(url: str) -> bool:
    """Return True if *url* contains a Stack Exchange network question link.

    Recognised hosts are ``stackoverflow.com``, ``superuser.com``,
    ``serverfault.com``, ``askubuntu.com`` and ``<site>.stackexchange.com``;
    the path must be ``/questions/<digits>``. The pattern is searched for
    anywhere in the string and the scheme is optional.

    Args:
        url: Candidate URL string.

    Returns:
        True when such a link is found, False otherwise.
    """
    return _STACK_QUESTION_URL_RE.search(url) is not None
# The leading lookbehind rejects look-alike hosts such as
# "fakenvd.nist.gov". Matching is case-insensitive, so "cve-..." and an
# upper-case host are both accepted.
_NVD_CVE_URL_RE = re.compile(
    r"(?<![\w-])(?:https?://)?(?:www\.)?nvd\.nist\.gov/vuln/detail/"
    r"CVE-\d{4}-\d{4,}",
    re.IGNORECASE,
)


def is_nvd_cve_url(url: str) -> bool:
    """Return True if *url* contains an NVD CVE detail-page link.

    A match has the shape ``nvd.nist.gov/vuln/detail/CVE-<yyyy>-<nnnn...>``
    with a four-digit year and at least four digits in the sequence number.
    The pattern is searched for anywhere in the string, case-insensitively,
    and the scheme is optional.

    Args:
        url: Candidate URL string.

    Returns:
        True when such a link is found, False otherwise.
    """
    return _NVD_CVE_URL_RE.search(url) is not None