diff --git a/src/auto_archiver/utils/url.py b/src/auto_archiver/utils/url.py
index 169ed87..368d93c 100644
--- a/src/auto_archiver/utils/url.py
+++ b/src/auto_archiver/utils/url.py
@@ -4,8 +4,8 @@ from ipaddress import ip_address
 
 
 AUTHWALL_URLS = [
-    re.compile(r"https:\/\/t\.me(\/c)\/(.+)\/(\d+)"),  # telegram private channels
-    re.compile(r"https:\/\/www\.instagram\.com"),  # instagram
+    re.compile(r"https?:\/\/t\.me(\/c)\/(.+)\/(\d+)"),  # telegram private channels
+    re.compile(r"https?:\/\/(www\.)?instagram\.com"),  # instagram
 ]
 
 
@@ -81,56 +81,45 @@ def is_relevant_url(url: str) -> bool:
     """
     clean_url = remove_get_parameters(url)
 
-    # favicons
-    if "favicon" in url:
-        return False
-    # ifnore icons
-    if clean_url.endswith(".ico"):
-        return False
-    # ignore SVGs
-    if remove_get_parameters(url).endswith(".svg"):
-        return False
+    IRRELEVANT_URLS = [
+        # favicons
+        ("favicon",),
+        # twitter profile pictures
+        ("twimg.com/profile_images",),
+        ("twimg.com", "default_profile_images"),
+        # instagram profile pictures
+        ("https://scontent.cdninstagram.com/", "150x150"),
+        # instagram recurring images
+        ("https://static.cdninstagram.com/rsrc.php/",),
+        # telegram
+        ("https://telegram.org/img/emoji/",),
+        # youtube
+        ("https://www.youtube.com/s/gaming/emoji/",),
+        ("https://yt3.ggpht.com", "default-user="),
+        ("https://www.youtube.com/s/search/audio/",),
+        # ok
+        ("https://ok.ru/res/i/",),
+        # vk
+        ("https://vk.com/emoji/",),
+        ("vk.com/images/",),
+        ("vk.com/images/reaction/",),
+        # wikipedia
+        ("wikipedia.org/static",),
+    ]
 
-    # twitter profile pictures
-    if "twimg.com/profile_images" in url:
-        return False
-    if "twimg.com" in url and "/default_profile_images" in url:
-        return False
+    IRRELEVANT_ENDS_WITH = [
+        ".svg",  # ignore SVGs
+        ".ico",  # ignore icons
+    ]
 
-    # instagram profile pictures
-    if "https://scontent.cdninstagram.com/" in url and "150x150" in url:
-        return False
-    # instagram recurring images
-    if "https://static.cdninstagram.com/rsrc.php/" in url:
-        return False
+    for end in IRRELEVANT_ENDS_WITH:
+        if clean_url.endswith(end):
+            return False
 
-    # telegram
-    if "https://telegram.org/img/emoji/" in url:
-        return False
-
-    # youtube
-    if "https://www.youtube.com/s/gaming/emoji/" in url:
-        return False
-    if "https://yt3.ggpht.com" in url and "default-user=" in url:
-        return False
-    if "https://www.youtube.com/s/search/audio/" in url:
-        return False
-
-    # ok
-    if " https://ok.ru/res/i/" in url:
-        return False
-
-    # vk
-    if "https://vk.com/emoji/" in url:
-        return False
-    if "vk.com/images/" in url:
-        return False
-    if "vk.com/images/reaction/" in url:
-        return False
-
-    # wikipedia
-    if "wikipedia.org/static" in url:
-        return False
+    # substring markers may sit in the query string, so match against the full URL
+    for parts in IRRELEVANT_URLS:
+        if all(part in url for part in parts):
+            return False
 
     return True
 
diff --git a/tests/utils/test_urls.py b/tests/utils/test_urls.py
new file mode 100644
index 0000000..81600ce
--- /dev/null
+++ b/tests/utils/test_urls.py
@@ -0,0 +1,115 @@
+import pytest
+from auto_archiver.utils.url import (
+    is_auth_wall,
+    check_url_or_raise,
+    domain_for_url,
+    is_relevant_url,
+    remove_get_parameters,
+    twitter_best_quality_url,
+)
+
+
+@pytest.mark.parametrize(
+    "url, is_auth",
+    [
+        ("https://example.com", False),
+        ("https://t.me/c/abc/123", True),
+        ("https://t.me/not-private/", False),
+        ("https://instagram.com", True),
+        ("https://www.instagram.com", True),
+        ("https://www.instagram.com/p/INVALID", True),
+        ("https://www.instagram.com/p/C4QgLbrIKXG/", True),
+    ],
+)
+def test_is_auth_wall(url, is_auth):
+    assert is_auth_wall(url) == is_auth
+
+
+@pytest.mark.parametrize(
+    "url, raises",
+    [
+        ("http://example.com", False),
+        ("https://example.com", False),
+        ("ftp://example.com", True),
+        ("http://localhost", True),
+        ("http://", True),
+    ],
+)
+def test_check_url_or_raise(url, raises):
+    if raises:
+        with pytest.raises(ValueError):
+            check_url_or_raise(url)
+    else:
+        assert check_url_or_raise(url)
+
+
+@pytest.mark.parametrize(
+    "url, domain",
+    [
+        ("https://example.com", "example.com"),
+        ("https://www.example.com", "www.example.com"),
+        ("https://www.example.com/path", "www.example.com"),
+        ("https://", ""),
+        ("http://localhost", "localhost"),
+    ],
+)
+def test_domain_for_url(url, domain):
+    assert domain_for_url(url) == domain
+
+
+@pytest.mark.parametrize(
+    "url, without_get",
+    [
+        ("https://example.com", "https://example.com"),
+        ("https://example.com?utm_source=example", "https://example.com"),
+        ("https://example.com?utm_source=example&other=1", "https://example.com"),
+        ("https://example.com/something", "https://example.com/something"),
+        ("https://example.com/something?utm_source=example", "https://example.com/something"),
+    ],
+)
+def test_remove_get_parameters(url, without_get):
+    assert remove_get_parameters(url) == without_get
+
+
+@pytest.mark.parametrize(
+    "url, relevant",
+    [
+        ("https://example.com", True),
+        ("https://example.com/favicon.ico", False),
+        ("https://twimg.com/profile_images", False),
+        ("https://twimg.com/something/default_profile_images", False),
+        ("https://scontent.cdninstagram.com/username/150x150.jpg", False),
+        ("https://scontent.cdninstagram.com/v/pic.jpg?stp=dst-jpg_s150x150", False),
+        ("https://static.cdninstagram.com/rsrc.php/", False),
+        ("https://telegram.org/img/emoji/", False),
+        ("https://www.youtube.com/s/gaming/emoji/", False),
+        ("https://yt3.ggpht.com/default-user=", False),
+        ("https://www.youtube.com/s/search/audio/", False),
+        ("https://ok.ru/res/i/", False),
+        ("https://vk.com/emoji/", False),
+        ("https://vk.com/images/", False),
+        ("https://vk.com/images/reaction/", False),
+        ("https://wikipedia.org/static", False),
+        ("https://example.com/file.svg", False),
+        ("https://example.com/file.ico", False),
+        ("https://example.com/file.svg?v=1", False),
+        ("https://example.com/file.mp4", True),
+        ("https://example.com/150x150.jpg", True),
+        ("https://example.com/rsrc.php/", True),
+        ("https://example.com/img/emoji/", True),
+    ],
+)
+def test_is_relevant_url(url, relevant):
+    assert is_relevant_url(url) == relevant
+
+
+@pytest.mark.parametrize(
+    "url, best_quality",
+    [
+        ("https://twitter.com/some_image.jpg?name=small", "https://twitter.com/some_image.jpg?name=orig"),
+        ("https://twitter.com/some_image.jpg", "https://twitter.com/some_image.jpg"),
+        ("https://twitter.com/some_image.jpg?name=orig", "https://twitter.com/some_image.jpg?name=orig"),
+    ],
+)
+def test_twitter_best_quality_url(url, best_quality):
+    assert twitter_best_quality_url(url) == best_quality