kopia lustrzana https://github.com/bellingcat/auto-archiver
new metadata enricher
rodzic
aa71c85a98
commit
1695954c98
|
@ -9,7 +9,7 @@ RUN pip install --upgrade pip && \
|
|||
pip install pipenv && \
|
||||
add-apt-repository ppa:mozillateam/ppa && \
|
||||
apt-get update && \
|
||||
apt-get install -y gcc ffmpeg fonts-noto && \
|
||||
apt-get install -y gcc ffmpeg fonts-noto exiftool && \
|
||||
apt-get install -y --no-install-recommends firefox-esr && \
|
||||
ln -s /usr/bin/firefox-esr /usr/bin/firefox && \
|
||||
wget https://github.com/mozilla/geckodriver/releases/download/v0.33.0/geckodriver-v0.33.0-linux64.tar.gz && \
|
||||
|
|
|
@ -15,6 +15,7 @@ steps:
|
|||
# - wacz_archiver_enricher
|
||||
enrichers:
|
||||
- hash_enricher
|
||||
# - metadata_enricher
|
||||
# - screenshot_enricher
|
||||
# - thumbnail_enricher
|
||||
# - wayback_archiver_enricher
|
||||
|
|
|
@ -0,0 +1,47 @@
|
|||
import subprocess
|
||||
import traceback
|
||||
from loguru import logger
|
||||
|
||||
from . import Enricher
|
||||
from ..core import Metadata
|
||||
|
||||
|
||||
class MetadataEnricher(Enricher):
|
||||
"""
|
||||
Extracts metadata information from files using exiftool.
|
||||
"""
|
||||
name = "metadata_enricher"
|
||||
|
||||
def __init__(self, config: dict) -> None:
|
||||
# without this STEP.__init__ is not called
|
||||
super().__init__(config)
|
||||
|
||||
@staticmethod
|
||||
def configs() -> dict:
|
||||
return {}
|
||||
|
||||
def enrich(self, to_enrich: Metadata) -> None:
|
||||
url = to_enrich.get_url()
|
||||
logger.debug(f"extracting EXIF metadata for {url=}")
|
||||
|
||||
for i, m in enumerate(to_enrich.media):
|
||||
if len(md := self.get_metadata(m.filename)):
|
||||
to_enrich.media[i].set("metadata", md)
|
||||
|
||||
def get_metadata(self, filename: str) -> dict:
|
||||
try:
|
||||
# Run ExifTool command to extract metadata from the file
|
||||
cmd = ['exiftool', filename]
|
||||
result = subprocess.run(cmd, capture_output=True, text=True)
|
||||
|
||||
# Process the output to extract individual metadata fields
|
||||
metadata = {}
|
||||
for line in result.stdout.splitlines():
|
||||
field, value = line.strip().split(':', 1)
|
||||
metadata[field.strip()] = value.strip()
|
||||
return metadata
|
||||
except FileNotFoundError:
|
||||
logger.error("[exif_enricher] ExifTool not found. Make sure ExifTool is installed and added to PATH.")
|
||||
except Exception as e:
|
||||
logger.error(f"Error occurred: {e}: {traceback.format_exc()}")
|
||||
return {}
|
Ładowanie…
Reference in New Issue