new metadata enricher

2023-07-28 12:46:30 +01:00 · 2023-07-28 12:46:30 +01:00 · 1695954c98
commit 1695954c98
--- a/2
+++ b/2
@ -9,7 +9,7 @@ RUN pip install --upgrade pip && \
 	pip install pipenv && \
 	add-apt-repository ppa:mozillateam/ppa && \
 	apt-get update && \
-	apt-get install -y gcc ffmpeg fonts-noto && \
+	apt-get install -y gcc ffmpeg fonts-noto exiftool && \
 	apt-get install -y --no-install-recommends firefox-esr && \
 	ln -s /usr/bin/firefox-esr /usr/bin/firefox && \
 	wget https://github.com/mozilla/geckodriver/releases/download/v0.33.0/geckodriver-v0.33.0-linux64.tar.gz && \
--- a/example.orchestration.yaml
+++ b/example.orchestration.yaml
@ -15,6 +15,7 @@ steps:
    # - wacz_archiver_enricher
  enrichers:
    - hash_enricher
    # - metadata_enricher
    # - screenshot_enricher
    # - thumbnail_enricher
    # - wayback_archiver_enricher
--- a/src/auto_archiver/enrichers/metadata_enricher.py
+++ b/src/auto_archiver/enrichers/metadata_enricher.py
@ -0,0 +1,47 @@
 import subprocess
 import traceback
 from loguru import logger
 from . import Enricher
 from ..core import Metadata
 class MetadataEnricher(Enricher):
    """
    Extracts metadata information from files using exiftool.

    For every media item of the Metadata object being enriched, the `exiftool`
    command-line program is run on the media file and the parsed output is
    stored on that media item under the "metadata" key.
    """
    name = "metadata_enricher"

    def __init__(self, config: dict) -> None:
        # without this STEP.__init__ is not called
        super().__init__(config)

    @staticmethod
    def configs() -> dict:
        """Return the configuration options of this enricher (it declares none)."""
        return {}

    def enrich(self, to_enrich: Metadata) -> None:
        """
        Attach exiftool metadata to each media item of `to_enrich`.

        Media items for which no metadata could be extracted are left untouched.
        """
        url = to_enrich.get_url()
        logger.debug(f"extracting EXIF metadata for {url=}")

        for m in to_enrich.media:
            # get_metadata always returns a dict; an empty one means "nothing found"
            if md := self.get_metadata(m.filename):
                m.set("metadata", md)

    def get_metadata(self, filename: str) -> dict:
        """
        Run exiftool on `filename` and parse its output into a dict.

        Each output line is split on its first ':' into a field name and a
        value, both stripped of surrounding whitespace. Lines that contain no
        ':' (blank lines, stray messages) are skipped so that a single
        unparsable line does not discard the metadata of the whole file.

        Returns an empty dict when exiftool is not installed or any other
        error occurs; errors are logged and never raised to the caller.
        """
        try:
            # Run ExifTool command to extract metadata from the file
            cmd = ['exiftool', filename]
            result = subprocess.run(cmd, capture_output=True, text=True)
            if result.returncode != 0:
                # keep going: whatever was written to stdout is still parsed below
                logger.warning(f"exiftool exited with code {result.returncode} for {filename}: {result.stderr.strip()}")

            # Process the output to extract individual metadata fields
            metadata = {}
            for line in result.stdout.splitlines():
                field, sep, value = line.partition(':')
                if not sep:
                    # not a "field : value" line, nothing to record
                    continue
                metadata[field.strip()] = value.strip()
            return metadata
        except FileNotFoundError:
            logger.error("[exif_enricher] ExifTool not found. Make sure ExifTool is installed and added to PATH.")
        except Exception as e:
            logger.error(f"Error occurred: {e}: {traceback.format_exc()}")
        return {}