kopia lustrzana https://github.com/bellingcat/auto-archiver
improves twitter URL cleaning and introduces another best-quality check
rodzic
c815488daa
commit
287e823f43
|
@ -133,6 +133,36 @@ def is_relevant_url(url: str) -> bool:
|
|||
def twitter_best_quality_url(url: str) -> str:
    """
    Some twitter image URLs point to a less-than-best quality rendition,
    selected through a ``name=<size>`` query parameter.

    This returns the URL pointing to the highest (original) quality, i.e. with
    ``name=orig``. Only the first query parameter that is literally called
    ``name`` is rewritten; look-alikes such as ``filename=`` or ``username=``,
    and any ``name=`` text outside the query string, are left untouched.

    :param url: the media URL to upgrade
    :return: the URL with ``name=orig``, or ``url`` unchanged when it has no
        ``name`` query parameter
    """
    parsed = urlparse(url)

    # Anchor on the start of the query or on the preceding "&" so the match is
    # a whole parameter name rather than the tail of a longer one.
    new_query, replaced = re.subn(r"(^|&)name=[^&]*", r"\1name=orig", parsed.query, count=1)
    if not replaced:
        return url

    return urlunparse(parsed._replace(query=new_query))
|
||||
|
||||
|
||||
def get_media_url_best_quality(url: str) -> str:
    """
    Returns the best quality URL for the given media URL, it may not exist.

    Two rewrites are applied, in this order:

    1. URLs hosted on twitter.com, twimg.com or x.com (or any of their
       subdomains) are passed through ``twitter_best_quality_url``.
    2. A trailing ``-<width>x<height>`` in the last path segment is removed,
       e.g. ``https://example.com/media-1280x720.mp4`` becomes
       ``https://example.com/media.mp4``. The query string is kept as is.

    :param url: the media URL to upgrade
    :return: the candidate best-quality URL, or ``url`` unchanged when neither
        rewrite applies
    """
    parsed = urlparse(url)

    # twitter case: compare the real hostname, exactly or as a dot-delimited
    # subdomain. A plain substring test would also accept unrelated hosts such
    # as "netflix.com" / "dropbox.com" (they contain "x.com") or credentials
    # like "twitter.com@evil.example".
    host = (parsed.hostname or "").lower()
    if any(host == domain or host.endswith("." + domain) for domain in ("twitter.com", "twimg.com", "x.com")):
        url = twitter_best_quality_url(url)
        parsed = urlparse(url)

    # some cases https://example.com/media-1280x720.mp4 to https://example.com/media.mp4
    # Only the final path segment is inspected, so a "-WxH" directory name is preserved.
    parent, slash, basename = parsed.path.rpartition("/")
    match = re.match(r"(.+)-\d+x\d+(\.[a-zA-Z0-9]+)$", basename)
    if match:
        orig_basename = match.group(1) + match.group(2)
        parsed = parsed._replace(path=parent + slash + orig_basename)  # keep the query unchanged
        url = urlunparse(parsed)

    return url
|
||||
|
|
|
@ -6,6 +6,7 @@ from auto_archiver.utils.url import (
|
|||
is_relevant_url,
|
||||
remove_get_parameters,
|
||||
twitter_best_quality_url,
|
||||
get_media_url_best_quality,
|
||||
)
|
||||
|
||||
|
||||
|
@ -109,10 +110,51 @@ def test_is_relevant_url(url, relevant):
|
|||
@pytest.mark.parametrize(
    "url, best_quality",
    [
        # a single name= parameter is upgraded
        (
            "https://twitter.com/some_image.jpg?name=small",
            "https://twitter.com/some_image.jpg?name=orig",
        ),
        # other query parameters survive the rewrite
        (
            "https://twitter.com/some_image.jpg?name=small&this_is_another=145",
            "https://twitter.com/some_image.jpg?name=orig&this_is_another=145",
        ),
        # no name= parameter: nothing to do
        (
            "https://twitter.com/some_image.jpg",
            "https://twitter.com/some_image.jpg",
        ),
        # already pointing at the original
        (
            "https://twitter.com/some_image.jpg?name=orig",
            "https://twitter.com/some_image.jpg?name=orig",
        ),
    ],
)
def test_twitter_best_quality_url(url, best_quality):
    """Each input URL must map to its expected ``name=orig`` form."""
    upgraded = twitter_best_quality_url(url)
    assert upgraded == best_quality
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "input_url,expected_url",
    [
        # Twitter: add/replace name= to name=orig
        (
            "https://pbs.twimg.com/media/abc123?format=jpg&name=small",
            "https://pbs.twimg.com/media/abc123?format=jpg&name=orig",
        ),
        (
            "https://pbs.twimg.com/media/abc123?name=large",
            "https://pbs.twimg.com/media/abc123?name=orig",
        ),
        (
            "https://pbs.twimg.com/media/abc123?format=jpg",
            "https://pbs.twimg.com/media/abc123?format=jpg",
        ),
        # Twitter: already orig
        (
            "https://pbs.twimg.com/media/abc123?format=jpg&name=orig",
            "https://pbs.twimg.com/media/abc123?format=jpg&name=orig",
        ),
        # X.com domain
        (
            "https://x.com/media/abc123?name=medium",
            "https://x.com/media/abc123?name=orig",
        ),
        # twimg.com domain
        (
            "https://twimg.com/media/abc123?name=thumb",
            "https://twimg.com/media/abc123?name=orig",
        ),
        # Non-twitter domain, no change
        (
            "https://example.com/media/file.mp4",
            "https://example.com/media/file.mp4",
        ),
        # Remove -WxH from basename
        (
            "https://example.com/media/file-1280x720.mp4",
            "https://example.com/media/file.mp4",
        ),
        (
            "https://example.com/media/file-1920x1080.jpg?foo=bar",
            "https://example.com/media/file.jpg?foo=bar",
        ),
        # Both twitter and -WxH
        (
            "https://pbs.twimg.com/media/abc-1280x720.jpg?name=small",
            "https://pbs.twimg.com/media/abc.jpg?name=orig",
        ),
        # No match for -WxH, no change
        (
            "https://example.com/media/file.mp4?foo=bar",
            "https://example.com/media/file.mp4?foo=bar",
        ),
        # Path with multiple directories
        (
            "https://example.com/a/b/c/file-640x480.png",
            "https://example.com/a/b/c/file.png",
        ),
        # -WxH in directory, not basename (should not change)
        (
            "https://example.com/media-1280x720/file.mp4",
            "https://example.com/media-1280x720/file.mp4",
        ),
    ],
)
def test_get_media_url_best_quality(input_url, expected_url):
    """Each media URL must be rewritten to its expected best-quality form."""
    rewritten = get_media_url_best_quality(input_url)
    assert rewritten == expected_url
|
||||
|
|
Ładowanie…
Reference in New Issue