Sniff filetype of downloaded media and add extension

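Also stream the download to disk in chunks. Together these resolve the two
TODOs in download_from_url (chunked download and extension guessing).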
2025-01-15 17:02:19 +01:00 · 2025-01-15 17:02:19 +01:00 · c3dd19f309
commit c3dd19f309
--- a/poetry.lock
+++ b/poetry.lock
@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 2.0.0 and should not be changed by hand.
+# This file is automatically @generated by Poetry 2.0.1 and should not be changed by hand.

 [[package]]
 name = "aiohappyeyeballs"
@ -889,6 +889,18 @@ future = "*"
 [package.extras]
 dev = ["Sphinx (==2.1.0)", "future (==0.17.1)", "numpy (==1.16.4)", "pytest (==4.6.1)", "pytest-mock (==1.10.4)", "tox (==3.12.1)"]

+[[package]]
+name = "filetype"
+version = "1.2.0"
+description = "Infer file type and MIME type of any file/buffer. No external dependencies."
+optional = false
+python-versions = "*"
+groups = ["main"]
+files = [
+    {file = "filetype-1.2.0-py2.py3-none-any.whl", hash = "sha256:7ce71b6880181241cf7ac8697a2f1eb6a8bd9b429f7ad6d27b8db9ba5f1c2d25"},
+    {file = "filetype-1.2.0.tar.gz", hash = "sha256:66b56cd6474bf41d8c54660347d37afcc3f7d1970648de365c102ef77548aadb"},
+]
+
 [[package]]
 name = "flask"
 version = "3.1.0"
@ -3296,4 +3308,4 @@ test = ["pytest (>=8.1,<9.0)"]
 [metadata]
 lock-version = "2.1"
 python-versions = ">=3.10,<3.13"
-content-hash = "7c7dc6d26e5af1c9bb6e4393b4ac64b155049d20a9f5317baec48c964a2708ac"
+content-hash = "df1bd49271b2682b82da437c2e6ce3842d116aa0fc7769e9ab9958c91a8647b2"
--- a/pyproject.toml
+++ b/pyproject.toml
@ -59,7 +59,8 @@ dependencies = [
    "retrying (>=0.0.0)",
    "tsp-client (>=0.0.0)",
    "certvalidator (>=0.0.0)",
-    "toml (>=0.10.2,<0.11.0)"
+    "toml (>=0.10.2,<0.11.0)",
+    "filetype (>=1.2.0,<2.0.0)"
 ]

 [tool.poetry.group.dev.dependencies]
--- a/src/auto_archiver/archivers/archiver.py
+++ b/src/auto_archiver/archivers/archiver.py
@ -1,6 +1,8 @@
 from __future__ import annotations
+from pathlib import Path
 from abc import abstractmethod
 from dataclasses import dataclass
+import filetype
 import os
 import mimetypes, requests
 from loguru import logger
@ -46,10 +48,8 @@ class Archiver(Step):
    @retry(wait_random_min=500, wait_random_max=3500, stop_max_attempt_number=5)
    def download_from_url(self, url: str, to_filename: str = None, verbose=True) -> str:
        """
-        downloads a URL to provided filename, or inferred from URL, returns local filename
+            downloads a URL to provided filename, or inferred from URL, returns local filename
        """
-        # TODO: should we refactor to use requests.get(url, stream=True) and write to file in chunks? compare approaches
-        # TODO: should we guess the extension?
        if not to_filename:
            to_filename = url.split('/')[-1].split('?')[0]
            if len(to_filename) > 64:
@ -59,11 +59,28 @@ class Archiver(Step):
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'
        }
-        d = requests.get(url, headers=headers)
-        assert d.status_code == 200, f"got response code {d.status_code} for {url=}"
-        with open(to_filename, 'wb') as f:
-            f.write(d.content)
-        return to_filename
+        try:
+            d = requests.get(url, stream=True, headers=headers)
+            d.raise_for_status()
+
+            # Peek at the first 256 bytes
+            first_256 = d.raw.read(256)
+
+            # Use filetype to guess the extension if there isn't already one
+            if not Path(to_filename).suffix:
+                guessed = filetype.guess(first_256)
+                extension = guessed.extension if guessed else None
+                if extension:
+                    to_filename += f".{extension}"
+
+            with open(to_filename, 'wb') as f:
+                f.write(first_256)
+                for chunk in d.iter_content(chunk_size=8192):
+                    f.write(chunk)
+            return to_filename
+        
+        except requests.RequestException as e:
+            logger.warning(f"Failed to fetch the Media URL: {e}")

    @abstractmethod
    def download(self, item: Metadata) -> Metadata: pass