# Registrable domains that serve Twitter/X media.
_TWITTER_DOMAINS = ("twitter.com", "twimg.com", "x.com")

# Matches a whole `name=<value>` query parameter; anchoring on the start of the
# query or a preceding `&` keeps `filename=` / `username=` from matching.
_TWITTER_NAME_PARAM = re.compile(r"(^|&)name=[^&]*")

# Matches a basename such as `media-1280x720.mp4`, capturing stem and extension.
_DIMENSION_SUFFIX = re.compile(r"(.+)-\d+x\d+(\.[a-zA-Z0-9]+)$")


def _is_twitter_host(hostname: str) -> bool:
    """
    True when hostname is one of the twitter domains or a subdomain of one.
    """
    return any(hostname == domain or hostname.endswith("." + domain) for domain in _TWITTER_DOMAINS)


def twitter_best_quality_url(url: str) -> str:
    """
    some twitter image URLs point to a less-than best quality
    this returns the URL pointing to the highest (original) quality (with 'name=orig')

    Only the first query parameter named exactly `name` is rewritten; every other
    parameter is kept as is. URLs without a `name` parameter are returned unchanged.
    """
    parsed = urlparse(url)
    # `\1` restores the leading `&` (or nothing when `name` is the first parameter)
    new_query, replaced = _TWITTER_NAME_PARAM.subn(r"\1name=orig", parsed.query, count=1)
    if not replaced:
        return url
    return urlunparse(parsed._replace(query=new_query))


def get_media_url_best_quality(url: str) -> str:
    """
    Returns the best quality URL for the given media URL, it may not exist.

    Two rewrites are applied:
    - twitter/x hosts: `name=<size>` becomes `name=orig`
    - any host: a `-<width>x<height>` suffix is dropped from the file name,
      e.g. https://example.com/media-1280x720.mp4 to https://example.com/media.mp4
    """
    parsed = urlparse(url)

    # twitter case: compare whole domain labels so that e.g. dropbox.com
    # (which merely contains "x.com") is not treated as a twitter host
    if _is_twitter_host(parsed.hostname or ""):
        url = twitter_best_quality_url(url)
        parsed = urlparse(url)

    # only the last path segment is inspected; directories are left untouched
    directory, slash, basename = parsed.path.rpartition("/")
    match = _DIMENSION_SUFFIX.match(basename)
    if match:
        new_path = directory + slash + match.group(1) + match.group(2)
        parsed = parsed._replace(path=new_path)  # keep the query unchanged
        url = urlunparse(parsed)

    return url
# (input URL, expected URL) pairs for twitter_best_quality_url.
TWITTER_BEST_QUALITY_CASES = [
    # name= is replaced and the other query parameters are preserved
    (
        "https://twitter.com/some_image.jpg?name=small&this_is_another=145",
        "https://twitter.com/some_image.jpg?name=orig&this_is_another=145",
    ),
    # no query string at all: returned as is
    ("https://twitter.com/some_image.jpg", "https://twitter.com/some_image.jpg"),
    # already pointing at the original quality
    ("https://twitter.com/some_image.jpg?name=orig", "https://twitter.com/some_image.jpg?name=orig"),
]

# (input URL, expected URL) pairs for get_media_url_best_quality.
MEDIA_BEST_QUALITY_CASES = [
    # Twitter: an existing name= value is replaced with name=orig
    (
        "https://pbs.twimg.com/media/abc123?format=jpg&name=small",
        "https://pbs.twimg.com/media/abc123?format=jpg&name=orig",
    ),
    ("https://pbs.twimg.com/media/abc123?name=large", "https://pbs.twimg.com/media/abc123?name=orig"),
    # Twitter: no name= parameter present, nothing is added
    ("https://pbs.twimg.com/media/abc123?format=jpg", "https://pbs.twimg.com/media/abc123?format=jpg"),
    # Twitter: already orig
    (
        "https://pbs.twimg.com/media/abc123?format=jpg&name=orig",
        "https://pbs.twimg.com/media/abc123?format=jpg&name=orig",
    ),
    # X.com domain
    ("https://x.com/media/abc123?name=medium", "https://x.com/media/abc123?name=orig"),
    # twimg.com domain
    ("https://twimg.com/media/abc123?name=thumb", "https://twimg.com/media/abc123?name=orig"),
    # Non-twitter domain, no change
    ("https://example.com/media/file.mp4", "https://example.com/media/file.mp4"),
    # Remove -WxH from basename
    ("https://example.com/media/file-1280x720.mp4", "https://example.com/media/file.mp4"),
    ("https://example.com/media/file-1920x1080.jpg?foo=bar", "https://example.com/media/file.jpg?foo=bar"),
    # Both twitter and -WxH
    ("https://pbs.twimg.com/media/abc-1280x720.jpg?name=small", "https://pbs.twimg.com/media/abc.jpg?name=orig"),
    # No match for -WxH, no change
    ("https://example.com/media/file.mp4?foo=bar", "https://example.com/media/file.mp4?foo=bar"),
    # Path with multiple directories
    ("https://example.com/a/b/c/file-640x480.png", "https://example.com/a/b/c/file.png"),
    # -WxH in directory, not basename (should not change)
    ("https://example.com/media-1280x720/file.mp4", "https://example.com/media-1280x720/file.mp4"),
]


@pytest.mark.parametrize("url, best_quality", TWITTER_BEST_QUALITY_CASES)
def test_twitter_best_quality_url(url, best_quality):
    """Each URL is rewritten to its name=orig form, or returned untouched."""
    result = twitter_best_quality_url(url)
    assert result == best_quality


@pytest.mark.parametrize("input_url,expected_url", MEDIA_BEST_QUALITY_CASES)
def test_get_media_url_best_quality(input_url, expected_url):
    """Twitter name= rewriting and -WxH basename stripping give the expected URL."""
    result = get_media_url_best_quality(input_url)
    assert result == expected_url