kopia lustrzana https://github.com/bellingcat/auto-archiver
Unit tests for url utils
rodzic
42e16aebd6
commit
168dfb6254
|
@ -4,8 +4,8 @@ from ipaddress import ip_address
|
||||||
|
|
||||||
|
|
||||||
AUTHWALL_URLS = [
|
AUTHWALL_URLS = [
|
||||||
re.compile(r"https:\/\/t\.me(\/c)\/(.+)\/(\d+)"), # telegram private channels
|
re.compile(r"https?:\/\/t\.me(\/c)\/(.+)\/(\d+)"), # telegram private channels
|
||||||
re.compile(r"https:\/\/www\.instagram\.com"), # instagram
|
re.compile(r"https?:\/\/(www\.)?instagram\.com"), # instagram
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
@ -81,56 +81,43 @@ def is_relevant_url(url: str) -> bool:
|
||||||
"""
|
"""
|
||||||
clean_url = remove_get_parameters(url)
|
clean_url = remove_get_parameters(url)
|
||||||
|
|
||||||
# favicons
|
IRRELEVANT_URLS = [
|
||||||
if "favicon" in url:
|
# favicons
|
||||||
return False
|
("favicon",),
|
||||||
# ignore icons
|
# twitter profile pictures
|
||||||
if clean_url.endswith(".ico"):
|
("twimg.com/profile_images",),
|
||||||
return False
|
("twimg.com", "default_profile_images"),
|
||||||
# ignore SVGs
|
# instagram profile pictures
|
||||||
if remove_get_parameters(url).endswith(".svg"):
|
("https://scontent.cdninstagram.com/", "150x150"),
|
||||||
return False
|
# instagram recurring images
|
||||||
|
("https://static.cdninstagram.com/rsrc.php/",),
|
||||||
|
# telegram
|
||||||
|
("https://telegram.org/img/emoji/",),
|
||||||
|
# youtube
|
||||||
|
("https://www.youtube.com/s/gaming/emoji/",),
|
||||||
|
("https://yt3.ggpht.com", "default-user="),
|
||||||
|
("https://www.youtube.com/s/search/audio/",),
|
||||||
|
# ok.ru and vk
|
||||||
|
("https://ok.ru/res/i/",),
|
||||||
|
("https://vk.com/emoji/",),
|
||||||
|
("vk.com/images/",),
|
||||||
|
("vk.com/images/reaction/",),
|
||||||
|
# wikipedia
|
||||||
|
("wikipedia.org/static",),
|
||||||
|
]
|
||||||
|
|
||||||
# twitter profile pictures
|
IRRELEVANT_ENDS_WITH = [
|
||||||
if "twimg.com/profile_images" in url:
|
".svg", # ignore SVGs
|
||||||
return False
|
".ico", # ignore icons
|
||||||
if "twimg.com" in url and "/default_profile_images" in url:
|
]
|
||||||
return False
|
|
||||||
|
|
||||||
# instagram profile pictures
|
for end in IRRELEVANT_ENDS_WITH:
|
||||||
if "https://scontent.cdninstagram.com/" in url and "150x150" in url:
|
if clean_url.endswith(end):
|
||||||
return False
|
return False
|
||||||
# instagram recurring images
|
|
||||||
if "https://static.cdninstagram.com/rsrc.php/" in url:
|
|
||||||
return False
|
|
||||||
|
|
||||||
# telegram
|
for parts in IRRELEVANT_URLS:
|
||||||
if "https://telegram.org/img/emoji/" in url:
|
if all(part in clean_url for part in parts):
|
||||||
return False
|
return False
|
||||||
|
|
||||||
# youtube
|
|
||||||
if "https://www.youtube.com/s/gaming/emoji/" in url:
|
|
||||||
return False
|
|
||||||
if "https://yt3.ggpht.com" in url and "default-user=" in url:
|
|
||||||
return False
|
|
||||||
if "https://www.youtube.com/s/search/audio/" in url:
|
|
||||||
return False
|
|
||||||
|
|
||||||
# ok
|
|
||||||
if " https://ok.ru/res/i/" in url:
|
|
||||||
return False
|
|
||||||
|
|
||||||
# vk
|
|
||||||
if "https://vk.com/emoji/" in url:
|
|
||||||
return False
|
|
||||||
if "vk.com/images/" in url:
|
|
||||||
return False
|
|
||||||
if "vk.com/images/reaction/" in url:
|
|
||||||
return False
|
|
||||||
|
|
||||||
# wikipedia
|
|
||||||
if "wikipedia.org/static" in url:
|
|
||||||
return False
|
|
||||||
|
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,143 @@
|
||||||
|
import pytest
|
||||||
|
from auto_archiver.utils.url import (
|
||||||
|
is_auth_wall,
|
||||||
|
check_url_or_raise,
|
||||||
|
domain_for_url,
|
||||||
|
is_relevant_url,
|
||||||
|
remove_get_parameters,
|
||||||
|
twitter_best_quality_url,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
"url, is_auth",
|
||||||
|
[
|
||||||
|
("https://example.com", False),
|
||||||
|
("https://t.me/c/abc/123", True),
|
||||||
|
("https://t.me/not-private/", False),
|
||||||
|
("https://instagram.com", True),
|
||||||
|
("https://www.instagram.com", True),
|
||||||
|
("https://www.instagram.com/p/INVALID", True),
|
||||||
|
("https://www.instagram.com/p/C4QgLbrIKXG/", True),
|
||||||
|
],
|
||||||
|
)
|
||||||
|
def test_is_auth_wall(url, is_auth):
|
||||||
|
assert is_auth_wall(url) == is_auth
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
"url, raises",
|
||||||
|
[
|
||||||
|
("http://example.com", False),
|
||||||
|
("https://example.com", False),
|
||||||
|
("ftp://example.com", True),
|
||||||
|
("http://localhost", True),
|
||||||
|
("http://", True),
|
||||||
|
],
|
||||||
|
)
|
||||||
|
def test_check_url_or_raise(url, raises):
|
||||||
|
if raises:
|
||||||
|
with pytest.raises(ValueError):
|
||||||
|
check_url_or_raise(url)
|
||||||
|
else:
|
||||||
|
assert check_url_or_raise(url)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
"url, domain",
|
||||||
|
[
|
||||||
|
("https://example.com", "example.com"),
|
||||||
|
("https://www.example.com", "www.example.com"),
|
||||||
|
("https://www.example.com/path", "www.example.com"),
|
||||||
|
("https://", ""),
|
||||||
|
("http://localhost", "localhost"),
|
||||||
|
],
|
||||||
|
)
|
||||||
|
def test_domain_for_url(url, domain):
|
||||||
|
assert domain_for_url(url) == domain
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
"url, without_get",
|
||||||
|
[
|
||||||
|
("https://example.com", "https://example.com"),
|
||||||
|
("https://example.com?utm_source=example", "https://example.com"),
|
||||||
|
("https://example.com?utm_source=example&other=1", "https://example.com"),
|
||||||
|
("https://example.com/something", "https://example.com/something"),
|
||||||
|
("https://example.com/something?utm_source=example", "https://example.com/something"),
|
||||||
|
],
|
||||||
|
)
|
||||||
|
def test_remove_get_parameters(url, without_get):
|
||||||
|
assert remove_get_parameters(url) == without_get
|
||||||
|
|
||||||
|
# IRRELEVANT_URLS = [
|
||||||
|
# # favicons
|
||||||
|
# ("favicon",),
|
||||||
|
# # twitter profile pictures
|
||||||
|
# ("twimg.com/profile_images",),
|
||||||
|
# ("twimg.com", "default_profile_images"),
|
||||||
|
# # instagram profile pictures
|
||||||
|
# ("https://scontent.cdninstagram.com/", "150x150"),
|
||||||
|
# # instagram recurring images
|
||||||
|
# ("https://static.cdninstagram.com/rsrc.php/",),
|
||||||
|
# # telegram
|
||||||
|
# ("https://telegram.org/img/emoji/",),
|
||||||
|
# # youtube
|
||||||
|
# ("https://www.youtube.com/s/gaming/emoji/",),
|
||||||
|
# ("https://yt3.ggpht.com", "default-user="),
|
||||||
|
# ("https://www.youtube.com/s/search/audio/",),
|
||||||
|
# # ok
|
||||||
|
# ("https://ok.ru/res/i/",),
|
||||||
|
# ("https://vk.com/emoji/",),
|
||||||
|
# ("vk.com/images/",),
|
||||||
|
# ("vk.com/images/reaction/",),
|
||||||
|
# # wikipedia
|
||||||
|
# ("wikipedia.org/static",),
|
||||||
|
# ]
|
||||||
|
|
||||||
|
# IRRELEVANT_ENDS_WITH = [
|
||||||
|
# ".svg", # ignore SVGs
|
||||||
|
# ".ico", # ignore icons
|
||||||
|
# ]
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
"url, relevant",
|
||||||
|
[
|
||||||
|
("https://example.com", True),
|
||||||
|
("https://example.com/favicon.ico", False),
|
||||||
|
("https://twimg.com/profile_images", False),
|
||||||
|
("https://twimg.com/something/default_profile_images", False),
|
||||||
|
("https://scontent.cdninstagram.com/username/150x150.jpg", False),
|
||||||
|
("https://static.cdninstagram.com/rsrc.php/", False),
|
||||||
|
("https://telegram.org/img/emoji/", False),
|
||||||
|
("https://www.youtube.com/s/gaming/emoji/", False),
|
||||||
|
("https://yt3.ggpht.com/default-user=", False),
|
||||||
|
("https://www.youtube.com/s/search/audio/", False),
|
||||||
|
("https://ok.ru/res/i/", False),
|
||||||
|
("https://vk.com/emoji/", False),
|
||||||
|
("https://vk.com/images/", False),
|
||||||
|
("https://vk.com/images/reaction/", False),
|
||||||
|
("https://wikipedia.org/static", False),
|
||||||
|
("https://example.com/file.svg", False),
|
||||||
|
("https://example.com/file.ico", False),
|
||||||
|
("https://example.com/file.mp4", True),
|
||||||
|
("https://example.com/150x150.jpg", True),
|
||||||
|
("https://example.com/rsrc.php/", True),
|
||||||
|
("https://example.com/img/emoji/", True),
|
||||||
|
],
|
||||||
|
)
|
||||||
|
def test_is_relevant_url(url, relevant):
|
||||||
|
assert is_relevant_url(url) == relevant
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
"url, best_quality",
|
||||||
|
[
|
||||||
|
("https://twitter.com/some_image.jpg?name=small", "https://twitter.com/some_image.jpg?name=orig"),
|
||||||
|
("https://twitter.com/some_image.jpg", "https://twitter.com/some_image.jpg"),
|
||||||
|
("https://twitter.com/some_image.jpg?name=orig", "https://twitter.com/some_image.jpg?name=orig"),
|
||||||
|
],
|
||||||
|
)
|
||||||
|
def test_twitter_best_quality_url(url, best_quality):
|
||||||
|
assert twitter_best_quality_url(url) == best_quality
|
Ładowanie…
Reference in New Issue