From 7a5c9c65bde141aae6a7fbde12c2a4ec4a989ca0 Mon Sep 17 00:00:00 2001
From: msramalho <19508417+msramalho@users.noreply.github.com>
Date: Fri, 28 Jul 2023 10:51:48 +0100
Subject: [PATCH] detects duplicates before storing, e.g. wacz getting media
 already fetched by another archiver

---
 src/auto_archiver/core/metadata.py           | 25 +++++++++++++++++++-
 src/auto_archiver/enrichers/hash_enricher.py |  2 +-
 2 files changed, 25 insertions(+), 2 deletions(-)

diff --git a/src/auto_archiver/core/metadata.py b/src/auto_archiver/core/metadata.py
index 56bde98..80b596d 100644
--- a/src/auto_archiver/core/metadata.py
+++ b/src/auto_archiver/core/metadata.py
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import hashlib
 from typing import Any, List, Union, Dict
 from dataclasses import dataclass, field
 from dataclasses_json import dataclass_json, config
@@ -42,6 +43,7 @@ class Metadata:
 
     def store(self: Metadata, override_storages: List = None):
         # calls .store for all contained media. storages [Storage]
+        self.remove_duplicate_media_by_hash()
         storages = override_storages or ArchivingContext.get("storages")
         for media in self.media:
             media.store(override_storages=storages, url=self.get_url())
@@ -122,6 +124,27 @@ class Metadata:
             if m.get("id") == id: return m
         return default
 
+    def remove_duplicate_media_by_hash(self) -> None:
+        # iterates all media, calculates a hash if it's missing and deletes duplicates
+        def calculate_hash_in_chunks(hash_algo, chunksize, filename) -> str:
+            # taken from hash_enricher, cannot be isolated to misc due to circular imports
+            with open(filename, "rb") as f:
+                while True:
+                    buf = f.read(chunksize)
+                    if not buf: break
+                    hash_algo.update(buf)
+            return hash_algo.hexdigest()
+
+        media_hashes = set()
+        new_media = []
+        for m in self.media:
+            h = m.get("hash")
+            # file.read() requires an int chunk size, so cast the 16MB default
+            if not h: h = calculate_hash_in_chunks(hashlib.sha256(), int(1.6e7), m.filename)
+            if len(h) and h in media_hashes: continue
+            media_hashes.add(h)
+            new_media.append(m)
+        self.media = new_media
+
     def get_first_image(self, default=None) -> Media:
         for m in self.media:
             if "image" in m.mimetype: return m
@@ -134,7 +158,7 @@ class Metadata:
     def get_final_media(self) -> Media:
         _default = self.media[0] if len(self.media) else None
         return self.get_media_by_id("_final_media", _default)
-    
+
     def get_all_media(self) -> List[Media]:
         # returns a list with all the media and inner media
         return [inner for m in self.media for inner in m.all_inner_media(True)]
diff --git a/src/auto_archiver/enrichers/hash_enricher.py b/src/auto_archiver/enrichers/hash_enricher.py
index 7bf8f89..2321d94 100644
--- a/src/auto_archiver/enrichers/hash_enricher.py
+++ b/src/auto_archiver/enrichers/hash_enricher.py
@@ -34,7 +34,7 @@ class HashEnricher(Enricher):
             if len(hd := self.calculate_hash(m.filename)):
                 to_enrich.media[i].set("hash", f"{self.algorithm}:{hd}")
 
-    def calculate_hash(self, filename):
+    def calculate_hash(self, filename) -> str:
         hash = None
         if self.algorithm == "SHA-256":
            hash = hashlib.sha256()
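
For illustration only, a minimal standalone sketch of the chunked-hash deduplication the patch adds, so the behaviour can be tried outside the Metadata class. The helper names (sha256_in_chunks, deduplicate_by_hash) and the temporary files are hypothetical and not part of the commit; the SHA-256 algorithm and the int(1.6e7) (~16 MB) chunk size mirror the values used above.

import hashlib
import os
import tempfile
from typing import List


def sha256_in_chunks(filename: str, chunksize: int = int(1.6e7)) -> str:
    # hash the file ~16 MB at a time so large media files never need to fit in RAM;
    # file.read() only accepts an int size, hence the cast of the 1.6e7 default
    hasher = hashlib.sha256()
    with open(filename, "rb") as f:
        while True:
            buf = f.read(chunksize)
            if not buf:
                break
            hasher.update(buf)
    return hasher.hexdigest()


def deduplicate_by_hash(filenames: List[str]) -> List[str]:
    # keep only the first file seen for each content hash, preserving order,
    # analogous to what remove_duplicate_media_by_hash does for Metadata.media
    seen = set()
    unique = []
    for name in filenames:
        digest = sha256_in_chunks(name)
        if digest in seen:
            continue
        seen.add(digest)
        unique.append(name)
    return unique


if __name__ == "__main__":
    # two files with identical bytes (e.g. the same video fetched by two different
    # archivers) plus one distinct file; the duplicate b.bin is dropped
    tmpdir = tempfile.mkdtemp()
    paths = [os.path.join(tmpdir, n) for n in ("a.bin", "b.bin", "c.bin")]
    for path, data in zip(paths, (b"same bytes", b"same bytes", b"different bytes")):
        with open(path, "wb") as f:
            f.write(data)
    print(deduplicate_by_hash(paths))  # prints the paths of a.bin and c.bin only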