kopia lustrzana https://github.com/bellingcat/auto-archiver
59 wiersze
2.0 KiB
Python
59 wiersze
2.0 KiB
Python
import datetime
|
|
import os
|
|
from loguru import logger
|
|
|
|
from . import Enricher
|
|
from ..core import Metadata
|
|
|
|
|
|
class MetaEnricher(Enricher):
|
|
"""
|
|
Adds metadata information about the archive operations, to be included at the end of all enrichments
|
|
"""
|
|
name = "meta_enricher"
|
|
|
|
|
|
def __init__(self, config: dict) -> None:
|
|
# without this STEP.__init__ is not called
|
|
super().__init__(config)
|
|
|
|
@staticmethod
|
|
def configs() -> dict:
|
|
return {}
|
|
|
|
def enrich(self, to_enrich: Metadata) -> None:
|
|
url = to_enrich.get_url()
|
|
if to_enrich.is_empty():
|
|
logger.debug(f"[SKIP] META_ENRICHER there is no media or metadata to enrich: {url=}")
|
|
return
|
|
|
|
logger.debug(f"calculating archive metadata information for {url=}")
|
|
|
|
self.enrich_file_sizes(to_enrich)
|
|
self.enrich_archive_duration(to_enrich)
|
|
|
|
def enrich_file_sizes(self, to_enrich: Metadata):
|
|
logger.debug(f"calculating archive file sizes for url={to_enrich.get_url()} ({len(to_enrich.media)} media files)")
|
|
total_size = 0
|
|
for media in to_enrich.get_all_media():
|
|
file_stats = os.stat(media.filename)
|
|
media.set("bytes", file_stats.st_size)
|
|
media.set("size", self.human_readable_bytes(file_stats.st_size))
|
|
total_size += file_stats.st_size
|
|
|
|
to_enrich.set("total_bytes", total_size)
|
|
to_enrich.set("total_size", self.human_readable_bytes(total_size))
|
|
|
|
|
|
def human_readable_bytes(self, size: int) -> str:
|
|
# receives number of bytes and returns human readble size
|
|
for unit in ["bytes", "KB", "MB", "GB", "TB"]:
|
|
if size < 1024:
|
|
return f"{size:.1f} {unit}"
|
|
size /= 1024
|
|
|
|
def enrich_archive_duration(self, to_enrich):
|
|
logger.debug(f"calculating archive duration for url={to_enrich.get_url()} ")
|
|
|
|
archive_duration = datetime.datetime.utcnow() - to_enrich.get("_processed_at")
|
|
to_enrich.set("archive_duration_seconds", archive_duration.seconds) |