"""Enricher that attaches exiftool-extracted metadata to archived media.

Deployment notes taken from the change set this module ships with:

* The ``exiftool`` executable must be on PATH; the accompanying Dockerfile
  change adds ``exiftool`` to the ``apt-get install`` line.
* ``example.orchestration.yaml`` lists ``metadata_enricher`` (commented out)
  under ``enrichers``; uncomment it there to enable this step.
"""
import subprocess
import traceback
from loguru import logger

from . import Enricher
from ..core import Metadata


class MetadataEnricher(Enricher):
    """
    Extracts metadata information from files using exiftool.

    For every media item of the archived result, ``exiftool`` is run on the
    item's file and the parsed ``field: value`` pairs are stored on that
    media item under the ``"metadata"`` key.
    """
    name = "metadata_enricher"

    def __init__(self, config: dict) -> None:
        # without this STEP.__init__ is not called
        super().__init__(config)

    @staticmethod
    def configs() -> dict:
        """Return the configuration options of this step (it has none)."""
        return {}

    def enrich(self, to_enrich: Metadata) -> None:
        """Attach exiftool metadata to each media item of ``to_enrich``.

        Media items for which no metadata could be extracted are left
        untouched (no empty ``"metadata"`` entry is written).
        """
        url = to_enrich.get_url()
        logger.debug(f"extracting EXIF metadata for {url=}")

        for media in to_enrich.media:
            # Empty dict means exiftool failed or produced nothing: skip.
            if md := self.get_metadata(media.filename):
                media.set("metadata", md)

    def get_metadata(self, filename: str) -> dict:
        """Run exiftool on ``filename`` and return its fields as a dict.

        Args:
            filename: path of the file to inspect.

        Returns:
            Mapping of exiftool field name to value (both stripped strings).
            An empty dict is returned when exiftool is missing, cannot be
            executed, or prints no parsable ``field: value`` line. Errors are
            logged, never raised.
        """
        try:
            # Decode explicitly as UTF-8 and replace undecodable bytes so a
            # single odd byte in a tag value (or a non-UTF-8 locale in the
            # container) cannot discard the whole result.
            result = subprocess.run(
                ['exiftool', filename],
                capture_output=True,
                encoding="utf-8",
                errors="replace",
            )
        except FileNotFoundError:
            logger.error("[exif_enricher] ExifTool not found. Make sure ExifTool is installed and added to PATH.")
            return {}
        except Exception as e:
            logger.error(f"Error occurred: {e}: {traceback.format_exc()}")
            return {}

        if result.returncode != 0:
            # Best effort: report the failure but still keep whatever fields
            # exiftool managed to print on stdout.
            logger.warning(
                f"exiftool exited with code {result.returncode} for {filename}: {result.stderr.strip()}"
            )

        return self._parse_exiftool_output(result.stdout)

    @staticmethod
    def _parse_exiftool_output(output: str) -> dict:
        """Parse exiftool's ``Field Name : value`` lines into a dict.

        Each line is split on the first colon only, so values that contain
        colons (timestamps, durations) stay intact. Lines without a colon are
        skipped instead of aborting the parse. If a field name occurs more
        than once, the last occurrence wins.
        """
        metadata = {}
        for line in output.splitlines():
            field, sep, value = line.strip().partition(':')
            if not sep:
                # Blank line or free-form message: not a "field: value" pair.
                continue
            metadata[field.strip()] = value.strip()
        return metadata