import re
from urllib.parse import urlparse, urlunparse


class UrlUtil:
    """Static helpers for classifying and normalising URLs."""

    # Private Telegram message links: http(s)://t.me/c/<chat>/<message id>.
    # The three capture groups are kept as-is in case callers read them.
    telegram_private = re.compile(r"https?://t\.me(/c)/(.+)/(\d+)")
    # Instagram, with or without the "www." prefix. The trailing alternation
    # anchors the end of the host so look-alike domains such as
    # "instagram.community.org" are not matched.
    # (Attribute name keeps its historical spelling for compatibility.)
    is_istagram = re.compile(r"https?://(?:www\.)?instagram\.com(?:[/?#:]|$)")

    @staticmethod
    def clean(url: str) -> str:
        """Return ``url`` unchanged (currently a no-op hook)."""
        return url

    @staticmethod
    def is_auth_wall(url: str) -> bool:
        """
        checks if URL is behind an authentication wall meaning steps like wayback, wacz, ... may not work

        Returns True when ``url`` starts with one of the known auth-walled
        patterns (private Telegram links, Instagram), False otherwise.
        """
        auth_wall_patterns = (UrlUtil.telegram_private, UrlUtil.is_istagram)
        return any(pattern.match(url) for pattern in auth_wall_patterns)

    @staticmethod
    def remove_get_parameters(url: str) -> str:
        """
        Strip the query string from ``url``; every other component is kept.

        http://example.com/file.mp4?t=1 -> http://example.com/file.mp4
        useful for mimetypes to work
        """
        parsed_url = urlparse(url)
        return urlunparse(parsed_url._replace(query=''))

    @staticmethod
    def is_relevant_url(url: str) -> bool:
        """
        Return False when a detected media URL is recurring and therefore irrelevant to a specific archive, True otherwise. Useful, for example, for the enumeration of the media files in WARC files which include profile pictures, favicons, etc.
        """
        # favicons
        if "favicon" in url:
            return False

        # twitter profile pictures
        if "twimg.com/profile_images" in url:
            return False

        # ignore icons and SVGs; the extension is checked with the query
        # string removed so "logo.svg?v=2" is still recognised
        clean_url = UrlUtil.remove_get_parameters(url)
        if clean_url.endswith((".ico", ".svg")):
            return False

        return True