NOTE(review): this patch was recovered from a copy in which every newline and
all leading indentation had been collapsed to single spaces. The record
structure below is reconstructed; each hunk's line counts were re-checked
against its header. Indentation of context/added/removed lines is inferred
from Python block structure and is NOT guaranteed to match the target tree --
verify before applying, or apply with `git apply --ignore-whitespace`.

NOTE(review): Storage.store() now returns early whenever media.is_stored(),
and is_stored() is true as soon as the media has any URL. Presumably, when
several storages are configured, only the first one will upload -- confirm
this is intended. The debug message also formats self.key (the storage), not
the media's key -- confirm which one was meant.

diff --git a/src/auto_archiver/core/media.py b/src/auto_archiver/core/media.py
index 9f15bdc..772a5a8 100644
--- a/src/auto_archiver/core/media.py
+++ b/src/auto_archiver/core/media.py
@@ -41,6 +41,9 @@ class Media:
                         if isinstance(prop_media, Media):
                             s.store(prop_media, url)
 
+    def is_stored(self) -> bool:
+        return len(self.urls) > 0
+
     def set(self, key: str, value: Any) -> Media:
         self.properties[key] = value
         return self
diff --git a/src/auto_archiver/core/metadata.py b/src/auto_archiver/core/metadata.py
index 09791bf..d2e89d4 100644
--- a/src/auto_archiver/core/metadata.py
+++ b/src/auto_archiver/core/metadata.py
@@ -15,14 +15,12 @@ from .context import ArchivingContext
 @dataclass
 class Metadata:
     status: str = "no archiver"
-    _processed_at: datetime = field(default_factory=datetime.datetime.utcnow)
     metadata: Dict[str, Any] = field(default_factory=dict)
     media: List[Media] = field(default_factory=list)
     rearchivable: bool = True # defaults to true, archivers can overwrite
 
-    # properties below are excluded from JSON representation
-    tmp_keys: Set[str] = field(default_factory=set, repr=False, metadata=config(exclude=True))
-    # tmp_metadata: Dict[str, Any] = field(default_factory=dict, repr=False, metadata=config(exclude=True)) # contains internal properties not to be leaked when .to_json/repr/str is called
+    def __post_init__(self):
+        self.set("_processed_at", datetime.datetime.utcnow())
 
     def merge(self: Metadata, right: Metadata, overwrite_left=True) -> Metadata:
         """
@@ -33,7 +31,6 @@ class Metadata:
             if right.status and len(right.status):
                 self.status = right.status
             self.rearchivable |= right.rearchivable
-            self.tmp_keys |= right.tmp_keys
             for k, v in right.metadata.items():
                 assert k not in self.metadata or type(v) == type(self.get(k))
                 if type(v) not in [dict, list, set] or k not in self.metadata:
@@ -52,10 +49,8 @@ class Metadata:
         for media in self.media:
             media.store(override_storages=storages)
 
-    def set(self, key: str, val: Any, is_tmp=False) -> Metadata:
-        # if not self.metadata: self.metadata = {}
+    def set(self, key: str, val: Any) -> Metadata:
         self.metadata[key] = val
-        if is_tmp: self.tmp_keys.add(key)
         return self
 
     def get(self, key: str, default: Any = None, create_if_missing=False) -> Union[Metadata, str]:
@@ -73,7 +68,7 @@ class Metadata:
         return "success" in self.status
 
     def is_empty(self) -> bool:
-        return not self.is_success() and len(self.media) == 0 and len(self.get_clean_metadata()) <= 2 # url, processed_at
+        return not self.is_success() and len(self.media) == 0 and len(self.metadata) <= 2 # url, processed_at
 
     @property # getter .netloc
     def netloc(self) -> str:
@@ -142,11 +137,5 @@ class Metadata:
         _default = self.media[0] if len(self.media) else None
         return self.get_media_by_id("_final_media", _default)
 
-    def get_clean_metadata(self) -> Metadata:
-        return dict(
-            {k: v for k, v in self.metadata.items() if k not in self.tmp_keys},
-            **{"processed_at": self._processed_at}
-        )
-
     def __str__(self) -> str:
         return self.__repr__()
diff --git a/src/auto_archiver/enrichers/__init__.py b/src/auto_archiver/enrichers/__init__.py
index fe9cc68..d33b49d 100644
--- a/src/auto_archiver/enrichers/__init__.py
+++ b/src/auto_archiver/enrichers/__init__.py
@@ -3,4 +3,5 @@ from .screenshot_enricher import ScreenshotEnricher
 from .wayback_enricher import WaybackArchiverEnricher
 from .hash_enricher import HashEnricher
 from .thumbnail_enricher import ThumbnailEnricher
-from .wacz_enricher import WaczEnricher
\ No newline at end of file
+from .wacz_enricher import WaczEnricher
+from .whisper_enricher import WhisperEnricher
\ No newline at end of file
diff --git a/src/auto_archiver/formatters/html_formatter.py b/src/auto_archiver/formatters/html_formatter.py
index 80722d3..f30156a 100644
--- a/src/auto_archiver/formatters/html_formatter.py
+++ b/src/auto_archiver/formatters/html_formatter.py
@@ -40,7 +40,7 @@ class HtmlFormatter(Formatter):
             url=url,
             title=item.get_title(),
             media=item.media,
-            metadata=item.get_clean_metadata(),
+            metadata=item.metadata,
             version=__version__
         )
         html_path = os.path.join(ArchivingContext.get_tmp_dir(), f"formatted{str(uuid.uuid4())}.html")
diff --git a/src/auto_archiver/storages/storage.py b/src/auto_archiver/storages/storage.py
index add7242..ce99042 100644
--- a/src/auto_archiver/storages/storage.py
+++ b/src/auto_archiver/storages/storage.py
@@ -42,10 +42,12 @@ class Storage(Step):
         return Step.init(name, config, Storage)
 
     def store(self, media: Media, url: str) -> None:
+        if media.is_stored():
+            logger.debug(f"{self.key} already stored, skipping")
+            return
         self.set_key(media, url)
         self.upload(media)
         media.add_url(self.get_cdn_url(media))
-        media.
 
     @abstractmethod
     def get_cdn_url(self, media: Media) -> str: pass