diff --git a/src/auto_archiver/utils/url.py b/src/auto_archiver/utils/url.py index 368d93c..9d7730b 100644 --- a/src/auto_archiver/utils/url.py +++ b/src/auto_archiver/utils/url.py @@ -78,6 +78,8 @@ def remove_get_parameters(url: str) -> str: def is_relevant_url(url: str) -> bool: """ Detect if a detected media URL is recurring and therefore irrelevant to a specific archive. Useful, for example, for the enumeration of the media files in WARC files which include profile pictures, favicons, etc. + + Assumption: URLs are relevant if they refer to files that can be downloaded with curl/requests, so excludes extensions like .m3u8. """ clean_url = remove_get_parameters(url) @@ -104,11 +106,17 @@ def is_relevant_url(url: str) -> bool: ("vk.com/images/reaction/",), # wikipedia ("wikipedia.org/static",), + # reddit + ("styles.redditmedia.com",), # opinionated but excludes many irrelevant images like avatars and banners + ("emoji.redditmedia.com",), ] IRRELEVANT_ENDS_WITH = [ ".svg", # ignore SVGs ".ico", # ignore icons + ".m3u8", + ".mpd", + ".ism", # ignore index files for videos, these should be handled by ytdlp ] for end in IRRELEVANT_ENDS_WITH: diff --git a/tests/utils/test_urls.py b/tests/utils/test_urls.py index 7871847..2fb66a5 100644 --- a/tests/utils/test_urls.py +++ b/tests/utils/test_urls.py @@ -95,6 +95,11 @@ def test_remove_get_parameters(url, without_get): ("https://example.com/150x150.jpg", True), ("https://example.com/rsrc.php/", True), ("https://example.com/img/emoji/", True), + ("https://styles.redditmedia.com/123", False), + ("https://emoji.redditmedia.com/abc.jpg", False), + ("https://example.com/rsrc.m3u8?asdasd=10", False), + ("https://example.com/rsrc.mpd", False), + ("https://example.com/rsrc.ism?vid=12", False), ], ) def test_is_relevant_url(url, relevant):