kopia lustrzana https://github.com/bellingcat/auto-archiver
adds new URLs to ignore
rodzic
4cfbc3008b
commit
c815488daa
|
@ -78,6 +78,8 @@ def remove_get_parameters(url: str) -> str:
|
|||
def is_relevant_url(url: str) -> bool:
|
||||
"""
|
||||
Detect if a detected media URL is recurring and therefore irrelevant to a specific archive. Useful, for example, for the enumeration of the media files in WARC files which include profile pictures, favicons, etc.
|
||||
|
||||
Assumption: URLs are relevant if they refer to files that can be downloaded with curl/requests, so excludes extensions like .m3u8.
|
||||
"""
|
||||
clean_url = remove_get_parameters(url)
|
||||
|
||||
|
@ -104,11 +106,17 @@ def is_relevant_url(url: str) -> bool:
|
|||
("vk.com/images/reaction/",),
|
||||
# wikipedia
|
||||
("wikipedia.org/static",),
|
||||
# reddit
|
||||
("styles.redditmedia.com",), # opinionated but excludes many irrelevant images like avatars and banners
|
||||
("emoji.redditmedia.com",),
|
||||
]
|
||||
|
||||
IRRELEVANT_ENDS_WITH = [
|
||||
".svg", # ignore SVGs
|
||||
".ico", # ignore icons
|
||||
".m3u8",
|
||||
".mpd",
|
||||
".ism", # ignore index files for videos, these should be handled by ytdlp
|
||||
]
|
||||
|
||||
for end in IRRELEVANT_ENDS_WITH:
|
||||
|
|
|
@ -95,6 +95,11 @@ def test_remove_get_parameters(url, without_get):
|
|||
("https://example.com/150x150.jpg", True),
|
||||
("https://example.com/rsrc.php/", True),
|
||||
("https://example.com/img/emoji/", True),
|
||||
("https://styles.redditmedia.com/123", False),
|
||||
("https://emoji.redditmedia.com/abc.jpg", False),
|
||||
("https://example.com/rsrc.m3u8?asdasd=10", False),
|
||||
("https://example.com/rsrc.mpd", False),
|
||||
("https://example.com/rsrc.ism?vid=12", False),
|
||||
],
|
||||
)
|
||||
def test_is_relevant_url(url, relevant):
|
||||
|
|
Ładowanie…
Reference in New Issue