Sniff filetype of downloaded media and add extension

Also download in chunks; this resolves two TODOs.
pull/175/head
Patrick Robertson 2025-01-15 17:02:19 +01:00
rodzic eebd040e13
commit c3dd19f309
3 zmienionych plików z 41 dodań i 11 usunięć

16
poetry.lock wygenerowano
Wyświetl plik

@ -1,4 +1,4 @@
# This file is automatically @generated by Poetry 2.0.0 and should not be changed by hand. # This file is automatically @generated by Poetry 2.0.1 and should not be changed by hand.
[[package]] [[package]]
name = "aiohappyeyeballs" name = "aiohappyeyeballs"
@ -889,6 +889,18 @@ future = "*"
[package.extras] [package.extras]
dev = ["Sphinx (==2.1.0)", "future (==0.17.1)", "numpy (==1.16.4)", "pytest (==4.6.1)", "pytest-mock (==1.10.4)", "tox (==3.12.1)"] dev = ["Sphinx (==2.1.0)", "future (==0.17.1)", "numpy (==1.16.4)", "pytest (==4.6.1)", "pytest-mock (==1.10.4)", "tox (==3.12.1)"]
[[package]]
name = "filetype"
version = "1.2.0"
description = "Infer file type and MIME type of any file/buffer. No external dependencies."
optional = false
python-versions = "*"
groups = ["main"]
files = [
{file = "filetype-1.2.0-py2.py3-none-any.whl", hash = "sha256:7ce71b6880181241cf7ac8697a2f1eb6a8bd9b429f7ad6d27b8db9ba5f1c2d25"},
{file = "filetype-1.2.0.tar.gz", hash = "sha256:66b56cd6474bf41d8c54660347d37afcc3f7d1970648de365c102ef77548aadb"},
]
[[package]] [[package]]
name = "flask" name = "flask"
version = "3.1.0" version = "3.1.0"
@ -3296,4 +3308,4 @@ test = ["pytest (>=8.1,<9.0)"]
[metadata] [metadata]
lock-version = "2.1" lock-version = "2.1"
python-versions = ">=3.10,<3.13" python-versions = ">=3.10,<3.13"
content-hash = "7c7dc6d26e5af1c9bb6e4393b4ac64b155049d20a9f5317baec48c964a2708ac" content-hash = "df1bd49271b2682b82da437c2e6ce3842d116aa0fc7769e9ab9958c91a8647b2"

Wyświetl plik

@ -59,7 +59,8 @@ dependencies = [
"retrying (>=0.0.0)", "retrying (>=0.0.0)",
"tsp-client (>=0.0.0)", "tsp-client (>=0.0.0)",
"certvalidator (>=0.0.0)", "certvalidator (>=0.0.0)",
"toml (>=0.10.2,<0.11.0)" "toml (>=0.10.2,<0.11.0)",
"filetype (>=1.2.0,<2.0.0)"
] ]
[tool.poetry.group.dev.dependencies] [tool.poetry.group.dev.dependencies]

Wyświetl plik

@ -1,6 +1,8 @@
from __future__ import annotations from __future__ import annotations
from pathlib import Path
from abc import abstractmethod from abc import abstractmethod
from dataclasses import dataclass from dataclasses import dataclass
import filetype
import os import os
import mimetypes, requests import mimetypes, requests
from loguru import logger from loguru import logger
@ -48,8 +50,6 @@ class Archiver(Step):
""" """
downloads a URL to provided filename, or inferred from URL, returns local filename downloads a URL to provided filename, or inferred from URL, returns local filename
""" """
# TODO: should we refactor to use requests.get(url, stream=True) and write to file in chunks? compare approaches
# TODO: should we guess the extension?
if not to_filename: if not to_filename:
to_filename = url.split('/')[-1].split('?')[0] to_filename = url.split('/')[-1].split('?')[0]
if len(to_filename) > 64: if len(to_filename) > 64:
@ -59,11 +59,28 @@ class Archiver(Step):
headers = { headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36' 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'
} }
d = requests.get(url, headers=headers) try:
assert d.status_code == 200, f"got response code {d.status_code} for {url=}" d = requests.get(url, stream=True, headers=headers)
d.raise_for_status()
# Peek at the first 256 bytes
first_256 = d.raw.read(256)
# Use filetype to guess the extension if there isn't already one
if not Path(to_filename).suffix:
guessed = filetype.guess(first_256)
extension = guessed.extension if guessed else None
if extension:
to_filename += f".{extension}"
with open(to_filename, 'wb') as f: with open(to_filename, 'wb') as f:
f.write(d.content) f.write(first_256)
for chunk in d.iter_content(chunk_size=8192):
f.write(chunk)
return to_filename return to_filename
except requests.RequestException as e:
logger.warning(f"Failed to fetch the Media URL: {e}")
@abstractmethod @abstractmethod
def download(self, item: Metadata) -> Metadata: pass def download(self, item: Metadata) -> Metadata: pass