kopia lustrzana https://github.com/bellingcat/auto-archiver
new metadata enricher
rodzic
aa71c85a98
commit
1695954c98
|
@ -9,7 +9,7 @@ RUN pip install --upgrade pip && \
|
||||||
pip install pipenv && \
|
pip install pipenv && \
|
||||||
add-apt-repository ppa:mozillateam/ppa && \
|
add-apt-repository ppa:mozillateam/ppa && \
|
||||||
apt-get update && \
|
apt-get update && \
|
||||||
apt-get install -y gcc ffmpeg fonts-noto && \
|
apt-get install -y gcc ffmpeg fonts-noto exiftool && \
|
||||||
apt-get install -y --no-install-recommends firefox-esr && \
|
apt-get install -y --no-install-recommends firefox-esr && \
|
||||||
ln -s /usr/bin/firefox-esr /usr/bin/firefox && \
|
ln -s /usr/bin/firefox-esr /usr/bin/firefox && \
|
||||||
wget https://github.com/mozilla/geckodriver/releases/download/v0.33.0/geckodriver-v0.33.0-linux64.tar.gz && \
|
wget https://github.com/mozilla/geckodriver/releases/download/v0.33.0/geckodriver-v0.33.0-linux64.tar.gz && \
|
||||||
|
|
|
@ -15,6 +15,7 @@ steps:
|
||||||
# - wacz_archiver_enricher
|
# - wacz_archiver_enricher
|
||||||
enrichers:
|
enrichers:
|
||||||
- hash_enricher
|
- hash_enricher
|
||||||
|
# - metadata_enricher
|
||||||
# - screenshot_enricher
|
# - screenshot_enricher
|
||||||
# - thumbnail_enricher
|
# - thumbnail_enricher
|
||||||
# - wayback_archiver_enricher
|
# - wayback_archiver_enricher
|
||||||
|
|
|
@ -0,0 +1,47 @@
|
||||||
|
import subprocess
|
||||||
|
import traceback
|
||||||
|
from loguru import logger
|
||||||
|
|
||||||
|
from . import Enricher
|
||||||
|
from ..core import Metadata
|
||||||
|
|
||||||
|
|
||||||
|
class MetadataEnricher(Enricher):
    """
    Extracts metadata information from files using exiftool.

    For every media item attached to the `Metadata` object being enriched,
    the `exiftool` command-line program is run on the media's file and the
    parsed "field: value" pairs are stored on that media item under the
    "metadata" key.
    """
    name = "metadata_enricher"

    def __init__(self, config: dict) -> None:
        # without this STEP.__init__ is not called
        super().__init__(config)

    @staticmethod
    def configs() -> dict:
        """Return the configuration options of this step (currently none)."""
        return {}

    def enrich(self, to_enrich: Metadata) -> None:
        """Attach exiftool metadata to each media item of `to_enrich`.

        Media items for which no metadata could be extracted are left
        unchanged.
        """
        url = to_enrich.get_url()
        logger.debug(f"extracting EXIF metadata for {url=}")

        for m in to_enrich.media:
            # An empty dict means "nothing extracted"; do not store it.
            if md := self.get_metadata(m.filename):
                m.set("metadata", md)

    def get_metadata(self, filename: str) -> dict:
        """Run exiftool on `filename` and return its output as a dict.

        Returns:
            A mapping of exiftool field names to their string values, or an
            empty dict when exiftool is missing or could not be executed.
        """
        try:
            # Run ExifTool command to extract metadata from the file.
            # A list argument (no shell) keeps the filename from being
            # interpreted by a shell.
            result = subprocess.run(
                ["exiftool", filename], capture_output=True, text=True
            )
        except FileNotFoundError:
            logger.error("[exif_enricher] ExifTool not found. Make sure ExifTool is installed and added to PATH.")
            return {}
        except Exception as e:
            logger.error(f"Error occurred: {e}: {traceback.format_exc()}")
            return {}

        if result.returncode != 0:
            # Best effort: report the failure but still parse whatever was
            # written to stdout.
            logger.warning(
                f"exiftool exited with code {result.returncode} for {filename}: {result.stderr.strip()}"
            )

        return self._parse_exiftool_output(result.stdout)

    @staticmethod
    def _parse_exiftool_output(output: str) -> dict:
        """Parse "field : value" lines from exiftool's stdout into a dict.

        Each line is split on its first colon only, so values that contain
        colons themselves (timestamps, URLs) stay intact. Lines without a
        colon are skipped instead of aborting the whole parse, so one
        unexpected line does not discard the fields already collected.
        """
        metadata = {}
        for line in output.splitlines():
            field, separator, value = line.strip().partition(":")
            if not separator:
                # Not a "field: value" line (e.g. blank line or banner).
                continue
            metadata[field.strip()] = value.strip()
        return metadata
|
Ładowanie…
Reference in New Issue