Improves Twitter URL cleaning and introduces another best-quality check

2025-06-10 16:09:38 +01:00 · 2025-06-10 16:09:38 +01:00 · 287e823f43
commit 287e823f43
--- a/src/auto_archiver/utils/url.py
+++ b/src/auto_archiver/utils/url.py
@ -133,6 +133,36 @@ def is_relevant_url(url: str) -> bool:
 def twitter_best_quality_url(url: str) -> str:
    """
    some twitter image URLs point to a less-than best quality
-    this returns the URL pointing to the highest (original) quality
+    this returns the URL pointing to the highest (original) quality (with 'name=orig')
    """
-    return re.sub(r"name=(\w+)", "name=orig", url, 1)
+    parsed = urlparse(url)
+    # Anchor on a parameter boundary so keys such as 'username=' or 'filename=' are never rewritten
+    new_query, replaced = re.subn(r"(^|&)name=[^&]*", r"\1name=orig", parsed.query, count=1)
+    if replaced:
+        # Only the first name=xxx parameter becomes name=orig; every other parameter is kept as-is
+        return urlunparse(parsed._replace(query=new_query))
+    # No name= parameter in the query: hand the URL back untouched
+    return url
+
+
+def get_media_url_best_quality(url: str) -> str:
+    """
+    Returns the best-quality URL for the given media URL; the returned URL is a guess and may not exist.
+    """
+    parsed = urlparse(url)
+
+    # twitter case: match the exact host or a subdomain, so e.g. "netflix.com" is not mistaken for "x.com"
+    host = parsed.hostname or ""
+    if any(host == d or host.endswith("." + d) for d in ("twitter.com", "twimg.com", "x.com")):
+        url = twitter_best_quality_url(url)
+        parsed = urlparse(url)
+
+    # some cases https://example.com/media-1280x720.mp4 to https://example.com/media.mp4
+    directory, sep, basename = parsed.path.rpartition("/")
+    match = re.match(r"(.+)-\d+x\d+(\.[a-zA-Z0-9]+)$", basename)
+    if match:
+        # drop the -WxH suffix from the basename only; directories and the query stay unchanged
+        parsed = parsed._replace(path=f"{directory}{sep}{match.group(1)}{match.group(2)}")
+        url = urlunparse(parsed)
+
+    return url
--- a/tests/utils/test_urls.py
+++ b/tests/utils/test_urls.py
@ -6,6 +6,7 @@ from auto_archiver.utils.url import (
    is_relevant_url,
    remove_get_parameters,
    twitter_best_quality_url,
+    get_media_url_best_quality,
 )


@ -109,10 +110,51 @@ def test_is_relevant_url(url, relevant):
@pytest.mark.parametrize(
    "url, best_quality",
    [
-        ("https://twitter.com/some_image.jpg?name=small", "https://twitter.com/some_image.jpg?name=orig"),
+        (
+            "https://twitter.com/some_image.jpg?name=small&this_is_another=145",
+            "https://twitter.com/some_image.jpg?name=orig&this_is_another=145",  # other parameters are kept
+        ),
        ("https://twitter.com/some_image.jpg", "https://twitter.com/some_image.jpg"),
        ("https://twitter.com/some_image.jpg?name=orig", "https://twitter.com/some_image.jpg?name=orig"),
    ],
 )
 def test_twitter_best_quality_url(url, best_quality):
    assert twitter_best_quality_url(url) == best_quality
+
+
+@pytest.mark.parametrize(
+    "input_url,expected_url",
+    [
+        # Twitter: an existing name= is replaced with name=orig; a URL without name= is left unchanged
+        (
+            "https://pbs.twimg.com/media/abc123?format=jpg&name=small",
+            "https://pbs.twimg.com/media/abc123?format=jpg&name=orig",
+        ),
+        ("https://pbs.twimg.com/media/abc123?name=large", "https://pbs.twimg.com/media/abc123?name=orig"),
+        ("https://pbs.twimg.com/media/abc123?format=jpg", "https://pbs.twimg.com/media/abc123?format=jpg"),
+        # Twitter: already orig
+        (
+            "https://pbs.twimg.com/media/abc123?format=jpg&name=orig",
+            "https://pbs.twimg.com/media/abc123?format=jpg&name=orig",
+        ),
+        # X.com domain
+        ("https://x.com/media/abc123?name=medium", "https://x.com/media/abc123?name=orig"),
+        # bare twimg.com domain (no subdomain)
+        ("https://twimg.com/media/abc123?name=thumb", "https://twimg.com/media/abc123?name=orig"),
+        # Non-twitter domain, no change
+        ("https://example.com/media/file.mp4", "https://example.com/media/file.mp4"),
+        # Remove -WxH from basename
+        ("https://example.com/media/file-1280x720.mp4", "https://example.com/media/file.mp4"),
+        ("https://example.com/media/file-1920x1080.jpg?foo=bar", "https://example.com/media/file.jpg?foo=bar"),
+        # Both twitter and -WxH
+        ("https://pbs.twimg.com/media/abc-1280x720.jpg?name=small", "https://pbs.twimg.com/media/abc.jpg?name=orig"),
+        # No match for -WxH, no change
+        ("https://example.com/media/file.mp4?foo=bar", "https://example.com/media/file.mp4?foo=bar"),
+        # Path with multiple directories
+        ("https://example.com/a/b/c/file-640x480.png", "https://example.com/a/b/c/file.png"),
+        # -WxH in directory, not basename (should not change)
+        ("https://example.com/media-1280x720/file.mp4", "https://example.com/media-1280x720/file.mp4"),
+    ],
+)
+def test_get_media_url_best_quality(input_url, expected_url):
+    assert get_media_url_best_quality(input_url) == expected_url