adds new URLs to ignore

pull/318/head
msramalho 2025-06-10 15:44:52 +01:00
rodzic 4cfbc3008b
commit c815488daa
Nie znaleziono w bazie danych klucza dla tego podpisu
2 zmienionych plików z 13 dodań i 0 usunięć

Wyświetl plik

@ -78,6 +78,8 @@ def remove_get_parameters(url: str) -> str:
def is_relevant_url(url: str) -> bool:
"""
Detect if a detected media URL is recurring and therefore irrelevant to a specific archive. Useful, for example, for the enumeration of the media files in WARC files which include profile pictures, favicons, etc.
Assumption: URLs are relevant if they refer to files that can be downloaded with curl/requests, so excludes extensions like .m3u8.
"""
clean_url = remove_get_parameters(url)
@ -104,11 +106,17 @@ def is_relevant_url(url: str) -> bool:
("vk.com/images/reaction/",),
# wikipedia
("wikipedia.org/static",),
# reddit
("styles.redditmedia.com",), # opinionated but excludes many irrelevant images like avatars and banners
("emoji.redditmedia.com",),
]
IRRELEVANT_ENDS_WITH = [
".svg", # ignore SVGs
".ico", # ignore icons
".m3u8",
".mpd",
".ism", # ignore index files for videos, these should be handled by ytdlp
]
for end in IRRELEVANT_ENDS_WITH:

Wyświetl plik

@ -95,6 +95,11 @@ def test_remove_get_parameters(url, without_get):
("https://example.com/150x150.jpg", True),
("https://example.com/rsrc.php/", True),
("https://example.com/img/emoji/", True),
("https://styles.redditmedia.com/123", False),
("https://emoji.redditmedia.com/abc.jpg", False),
("https://example.com/rsrc.m3u8?asdasd=10", False),
("https://example.com/rsrc.mpd", False),
("https://example.com/rsrc.ism?vid=12", False),
],
)
def test_is_relevant_url(url, relevant):